## Library

In [1]:
import pandas as pd
import os
import os.path as osp
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Data

In [2]:
data_dir = '../dataset/housing'

train = pd.read_csv(osp.join(data_dir, 'train.csv'))
test = pd.read_csv(osp.join(data_dir, 'test.csv'))

train.drop('id', axis=1, inplace=True) # id 제거
test.drop('id', axis=1, inplace=True) # id 제거
print(train.shape, test.shape)

train.head()

(1350, 14) (1350, 13)


Unnamed: 0,Overall Qual,Gr Liv Area,Exter Qual,Garage Cars,Garage Area,Kitchen Qual,Total Bsmt SF,1st Flr SF,Bsmt Qual,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt,target
0,10,2392,Ex,3,968,Ex,2392,2392,Ex,2,2003,2003,2003,386250
1,7,1352,Gd,2,466,Gd,1352,1352,Ex,2,2006,2007,2006,194000
2,5,900,TA,1,288,TA,864,900,TA,1,1967,1967,1967,123000
3,5,1174,TA,2,576,Gd,680,680,TA,1,1900,2006,2000,135000
4,7,1958,Gd,3,936,Gd,1026,1026,Gd,2,2005,2005,2005,250000


## Preprocessing

In [3]:
# 중복값 제거
print("제거 전 :", train.shape)
train = train.drop_duplicates()
print("제거 후 :", train.shape)

제거 전 : (1350, 14)
제거 후 : (1349, 14)


In [4]:
# train[train['Garage Yr Blt']> 2050] # 254
train.loc[254, 'Garage Yr Blt'] = 2007

In [5]:
# 품질 관련 변수 → 숫자로 매핑
qual_cols = train.dtypes[train.dtypes == np.object].index
def label_encoder(df_, qual_cols):
    df = df_.copy()
    mapping={
        'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1
    }
    for col in qual_cols :
        df[col] = df[col].map(mapping)
    return df

train = label_encoder(train, qual_cols)
test = label_encoder(test, qual_cols)
train.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Exter Qual,Garage Cars,Garage Area,Kitchen Qual,Total Bsmt SF,1st Flr SF,Bsmt Qual,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt,target
0,10,2392,5,3,968,5,2392,2392,5,2,2003,2003,2003,386250
1,7,1352,4,2,466,4,1352,1352,5,2,2006,2007,2006,194000
2,5,900,3,1,288,3,864,900,3,1,1967,1967,1967,123000
3,5,1174,3,2,576,4,680,680,3,1,1900,2006,2000,135000
4,7,1958,4,3,936,4,1026,1026,4,2,2005,2005,2005,250000


### 로그 변환

In [6]:
print(f'로그 변환 전 타겟 왜도 = {train.target.skew()} / 로그 변환 후 타겟 왜도 = {np.log1p(train.target).skew()}')

로그 변환 전 타겟 왜도 = 1.7205733874129123 / 로그 변환 후 타겟 왜도 = 0.08224141935772546


In [7]:
X = train.drop('target', axis = 1)
y = np.log1p(train.target)

target = test[X.columns]

In [8]:
target.fillna(target.mean(), inplace = True)

## 파생 변수

In [9]:
def feature_eng(data_):
    data = data_.copy()
    data['Year Gap Remod'] = data['Year Remod/Add'] - data['Year Built']
    data['Car Area'] = data['Garage Area']/data['Garage Cars']
    data['2nd flr SF'] = data['Gr Liv Area'] - data['1st Flr SF']
    data['2nd flr'] = data['2nd flr SF'].apply(lambda x : 1 if x > 0 else 0)
    data['Total SF'] = data[['Gr Liv Area',"Garage Area", "Total Bsmt SF"]].sum(axis=1)
    data['Sum Qual'] = data[["Exter Qual", "Kitchen Qual", "Overall Qual"]].sum(axis=1)
    data['Garage InOut'] = data.apply(lambda x : 1 if x['Gr Liv Area'] != x['1st Flr SF'] else 0, axis=1)
    return data

train = feature_eng(train)
test = feature_eng(test)

## Modeling

In [22]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [11]:
# 평가 기준 정의
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [12]:
nmae_score = make_scorer(NMAE, greater_is_better=False)
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [13]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# LinearRegression
lr_pred = np.zeros(target.shape[0])
lr_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    lr = LinearRegression(normalize=True)
    lr.fit(tr_x, tr_y)
    
    val_pred = np.expm1(lr.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    lr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = lr.predict(target) / 10
    lr_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(lr_val)} & std = {np.std(lr_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08516773299438772

2 FOLD Training.....
2 FOLD NMAE = 0.1037334379169279

3 FOLD Training.....
3 FOLD NMAE = 0.09347602011178413

4 FOLD Training.....
4 FOLD NMAE = 0.11801350027022185

5 FOLD Training.....
5 FOLD NMAE = 0.08115283407751611

6 FOLD Training.....
6 FOLD NMAE = 0.10994532937271366

7 FOLD Training.....
7 FOLD NMAE = 0.09618690647772615

8 FOLD Training.....
8 FOLD NMAE = 0.08980927039476311

9 FOLD Training.....
9 FOLD NMAE = 0.09992001517045702

10 FOLD Training.....
10 FOLD NMAE = 0.0996037250804027

10FOLD Mean of NMAE = 0.09770087718669004 & std = 0.010586797791535223


In [14]:
# Ridge
rg_pred = np.zeros(target.shape[0])
rg_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    rg = Ridge()
    rg.fit(tr_x, tr_y)
    
    val_pred = np.expm1(rg.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    rg_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = rg.predict(target) / 10
    rg_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(rg_val)} & std = {np.std(rg_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08518848373299504

2 FOLD Training.....
2 FOLD NMAE = 0.10373385528863602

3 FOLD Training.....
3 FOLD NMAE = 0.09347833918874456

4 FOLD Training.....
4 FOLD NMAE = 0.11804341045588934

5 FOLD Training.....
5 FOLD NMAE = 0.0811462844719415

6 FOLD Training.....
6 FOLD NMAE = 0.10990169890105458

7 FOLD Training.....
7 FOLD NMAE = 0.09615468462531229

8 FOLD Training.....
8 FOLD NMAE = 0.08979721936963668

9 FOLD Training.....
9 FOLD NMAE = 0.09990731441863476

10 FOLD Training.....
10 FOLD NMAE = 0.09960676563692862

10FOLD Mean of NMAE = 0.09769580560897734 & std = 0.010587156980456586


In [15]:
# Lasso
ls_pred = np.zeros(target.shape[0])
ls_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    ls = Lasso()
    ls.fit(tr_x, tr_y)
    
    val_pred = np.expm1(ls.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    ls_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = ls.predict(target) / 10
    ls_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(ls_val)} & std = {np.std(ls_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.11014363861789106

2 FOLD Training.....
2 FOLD NMAE = 0.12859231598969093

3 FOLD Training.....
3 FOLD NMAE = 0.11696568675758723

4 FOLD Training.....
4 FOLD NMAE = 0.14546443450456834

5 FOLD Training.....
5 FOLD NMAE = 0.117609881931101

6 FOLD Training.....
6 FOLD NMAE = 0.11848164201239463

7 FOLD Training.....
7 FOLD NMAE = 0.1176699980965536

8 FOLD Training.....
8 FOLD NMAE = 0.09794562799080352

9 FOLD Training.....
9 FOLD NMAE = 0.11839055793197424

10 FOLD Training.....
10 FOLD NMAE = 0.13895906935473457

10FOLD Mean of NMAE = 0.1210222853187299 & std = 0.01296166843443742


In [16]:
# ElasticNet
el_pred = np.zeros(target.shape[0])
el_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    el = ElasticNet()
    el.fit(tr_x, tr_y)
    
    val_pred = np.expm1(el.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    el_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = el.predict(target) / 10
    el_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(el_val)} & std = {np.std(el_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.10461174332147474

2 FOLD Training.....
2 FOLD NMAE = 0.12496946106957815

3 FOLD Training.....
3 FOLD NMAE = 0.11391873661792358

4 FOLD Training.....
4 FOLD NMAE = 0.14396580852288096

5 FOLD Training.....
5 FOLD NMAE = 0.10916888331837661

6 FOLD Training.....
6 FOLD NMAE = 0.11474280678068112

7 FOLD Training.....
7 FOLD NMAE = 0.10890616344143171

8 FOLD Training.....
8 FOLD NMAE = 0.09590813203234295

9 FOLD Training.....
9 FOLD NMAE = 0.1168680190598666

10 FOLD Training.....
10 FOLD NMAE = 0.1311141226888767

10FOLD Mean of NMAE = 0.11641738768534331 & std = 0.013138721886320266


In [17]:
# GradientBoostingRegressor
gbr_pred = np.zeros(target.shape[0])
gbr_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    gbr = GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate = 0.05, n_estimators = 1000)
    gbr.fit(tr_x, tr_y)
    
    val_pred = np.expm1(gbr.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    gbr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = gbr.predict(target) / 10
    gbr_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(gbr_val)} & std = {np.std(gbr_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08973474875044812

2 FOLD Training.....
2 FOLD NMAE = 0.09342373840136703

3 FOLD Training.....
3 FOLD NMAE = 0.09635404984993284

4 FOLD Training.....
4 FOLD NMAE = 0.12391731285331489

5 FOLD Training.....
5 FOLD NMAE = 0.0954282024344578

6 FOLD Training.....
6 FOLD NMAE = 0.10285604236696992

7 FOLD Training.....
7 FOLD NMAE = 0.09251126055360143

8 FOLD Training.....
8 FOLD NMAE = 0.09576184784091606

9 FOLD Training.....
9 FOLD NMAE = 0.1035561882746765

10 FOLD Training.....
10 FOLD NMAE = 0.10218753408873578

10FOLD Mean of NMAE = 0.09957309254144205 & std = 0.009233981537260185


In [18]:
# RandomForestRegressor
rf_pred = np.zeros(target.shape[0])
rf_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    rf = RandomForestRegressor(random_state = 42, criterion = 'mae')
    rf.fit(tr_x, tr_y)
    
    val_pred = np.expm1(rf.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    rf_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = rf.predict(target) / 10
    rf_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(rf_val)} & std = {np.std(rf_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.0922156220711973

2 FOLD Training.....
2 FOLD NMAE = 0.10043356835559464

3 FOLD Training.....
3 FOLD NMAE = 0.10055273007851406

4 FOLD Training.....
4 FOLD NMAE = 0.12538377681850285

5 FOLD Training.....
5 FOLD NMAE = 0.09026369730237521

6 FOLD Training.....
6 FOLD NMAE = 0.09540843393029043

7 FOLD Training.....
7 FOLD NMAE = 0.08821015637488719

8 FOLD Training.....
8 FOLD NMAE = 0.09083184806061635

9 FOLD Training.....
9 FOLD NMAE = 0.10166828818352941

10 FOLD Training.....
10 FOLD NMAE = 0.10294099136490703

10FOLD Mean of NMAE = 0.09879091125404145 & std = 0.010203790026087891


In [19]:
# NGBRegressor
ngb_pred = np.zeros(target.shape[0])
ngb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    ngb = NGBRegressor(random_state = 42, n_estimators = 1000, verbose = 0, learning_rate = 0.03)
    ngb.fit(tr_x, tr_y, val_x, val_y, early_stopping_rounds = 300)
    
    val_pred = np.expm1(ngb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    ngb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = ngb.predict(target) / 10
    ngb_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(ngb_val)} & std = {np.std(ngb_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08540200060927182

2 FOLD Training.....
2 FOLD NMAE = 0.09392208071849062

3 FOLD Training.....
3 FOLD NMAE = 0.09342973709603332

4 FOLD Training.....
4 FOLD NMAE = 0.1158158053273631

5 FOLD Training.....
5 FOLD NMAE = 0.08613295738895065

6 FOLD Training.....
6 FOLD NMAE = 0.09519635820490327

7 FOLD Training.....
7 FOLD NMAE = 0.09119477028522797

8 FOLD Training.....
8 FOLD NMAE = 0.09006259727788231

9 FOLD Training.....
9 FOLD NMAE = 0.09642556796539417

10 FOLD Training.....
10 FOLD NMAE = 0.10160829361550165

10FOLD Mean of NMAE = 0.09491901684890189 & std = 0.008319879236830889


In [20]:
# Catboost
cb_pred = np.zeros(target.shape[0])
cb_val = []
for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    tr_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    
    cb = CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0)
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 750, verbose = 1000)
    
    val_pred = np.expm1(cb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred)
    cb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    target_data = Pool(data = target, label = None)
    fold_pred = cb.predict(target) / 10
    cb_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(cb_val)} & std = {np.std(cb_val)}')

1 FOLD Training.....
0:	learn: 0.2930177	test: 187886.6144612	best: 187886.6144612 (0)	total: 139ms	remaining: 6m 57s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 187886.5491
bestIteration = 142

Shrink model to first 143 iterations.
1 FOLD NMAE = 0.09278793165778815

2 FOLD Training.....
0:	learn: 0.2952100	test: 183672.7368567	best: 183672.7368567 (0)	total: 1.9ms	remaining: 5.69s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 183672.7231
bestIteration = 91

Shrink model to first 92 iterations.
2 FOLD NMAE = 0.119763220923516

3 FOLD Training.....
0:	learn: 0.2873214	test: 190826.8660616	best: 190826.8660616 (0)	total: 1.88ms	remaining: 5.63s
Stopped by overfitting detector  (750 iterations wait)

bestTest = 190826.8025
bestIteration = 156

Shrink model to first 157 iterations.
3 FOLD NMAE = 0.10428467631314435

4 FOLD Training.....
0:	learn: 0.2923176	test: 176271.9526156	best: 176271.9526156 (0)	total: 1.84ms	remaining: 5.53s
Stopped by ove

In [23]:
# 초기화
xgb_pred = np.zeros(target.shape[0])
xgb_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    # xgb 학습
    xgb = XGBRegressor(random_state = 42, criterion = 'mae')
    xgb.fit(tr_x, tr_y)
    
    val_pred = np.expm1(xgb.predict(val_x))
    val_nmae = NMAE(val_y, val_pred) # NMAE
    xgb_val.append(val_nmae) # 초기화한 리스트에 NMAE를 삽입.
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = xgb.predict(target) / 10
    xgb_pred += fold_pred

print(f'10FOLD Mean of NMAE = {np.mean(xgb_val)} & std = {np.std(xgb_val)}')

1 FOLD Training.....
Parameters: { "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


1 FOLD NMAE = 0.09585665006796414

2 FOLD Training.....
Parameters: { "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


2 FOLD NMAE = 0.10365404597865192

3 FOLD Training.....
Parameters: { "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but 

In [24]:
# 초기화
lgbm_pred = np.zeros(target.shape[0])
lgbm_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f'{n + 1} FOLD Training.....')
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], np.expm1(y.iloc[val_idx])
    
    # xgb 학습
    lgbm = LGBMRegressor(random_state = 42, criterion = 'mae')
    lgbm.fit(tr_x, tr_y)
    
    val_pred = np.expm1(lgbm.predict(val_x))
    val_nmae = NMAE(val_y, val_pred) # NMAE
    lgbm_val.append(val_nmae) # 초기화한 리스트에 NMAE를 삽입.
    print(f'{n + 1} FOLD NMAE = {val_nmae}\n')
    
    fold_pred = lgbm.predict(target) / 10
    lgbm_pred += fold_pred

print(f'10FOLD Mean of NMAE = {np.mean(lgbm_val)} & std = {np.std(lgbm_val)}')

1 FOLD Training.....
1 FOLD NMAE = 0.08965525753168317

2 FOLD Training.....
2 FOLD NMAE = 0.09204765642380996

3 FOLD Training.....
3 FOLD NMAE = 0.10725799296644017

4 FOLD Training.....
4 FOLD NMAE = 0.12752712689735532

5 FOLD Training.....
5 FOLD NMAE = 0.09191914851241564

6 FOLD Training.....
6 FOLD NMAE = 0.09912622286935108

7 FOLD Training.....
7 FOLD NMAE = 0.08867373084975455

8 FOLD Training.....
8 FOLD NMAE = 0.0907089480909202

9 FOLD Training.....
9 FOLD NMAE = 0.10531487169660025

10 FOLD Training.....
10 FOLD NMAE = 0.10295641978149356

10FOLD Mean of NMAE = 0.0995187375619824 & std = 0.01138428061682549


In [26]:
# 검증 성능 확인하기
val_list = [lr_val, rg_val, ls_val, el_val, gbr_val, rf_val, ngb_val, cb_val, xgb_val, lgbm_val]
for val in val_list :
    print("{:.8f}".format(np.mean(val))) 

0.09770088
0.09769581
0.12102229
0.11641739
0.09957309
0.09879091
0.09491902
0.10810863
0.10377519
0.09951874


0.09491902 + 0.09879091 + 0.09769581 + 0.09957309

In [34]:
# submission 파일에 입력
sub = pd.read_csv(osp.join(data_dir, 'sample_submission.csv'))
sub['target'] = np.expm1((ngb_pred + rf_pred + rg_pred + gbr_pred + lgbm_pred) / 5)
sub['target']

0       336185.372002
1       126674.570725
2       174364.865468
3       238372.108291
4       129522.161789
            ...      
1345    329908.652112
1346    124872.015850
1347     77339.468512
1348    197182.081454
1349    140603.137902
Name: target, Length: 1350, dtype: float64

In [35]:
# csv 파일로 내보내기
sub.to_csv('./11th.csv', index=False) 