In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

## 데이터 수정

In [3]:
X_train = df_train.drop(['id', 'price'], axis=1)
y_train = np.log1p(df_train['price'])
X_test = df_test.drop(['id'], axis=1)

In [4]:
# Adding features
for df in [X_train, X_test]:
    df['date(new)'] = df['date'].apply(lambda x: int(x[4:8])+800 if x[:4] == '2015' else int(x[4:8])-400)
    df['how_old'] = df['date'].apply(lambda x: x[:4]).astype(int) - df[['yr_built', 'yr_renovated']].max(axis=1)
    del df['date']
    del df['yr_renovated']
    df['yr_built'] = df['yr_built'] - 1900
    df['sqft_floor'] = df['sqft_above'] / df['floors']
    df['floor_area_ratio'] = df['sqft_living'] / df['sqft_lot']
    del df['sqft_lot15']

In [5]:
# Log Scaling
log_features = ['bedrooms', 'bathrooms', 'sqft_lot', 'sqft_living', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_floor', 'floor_area_ratio', 'floor_area_ratio']
for feature in log_features:
    for df in [X_train, X_test]:
        df[feature] = np.log1p(df[feature])

In [6]:
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, RandomizedSearchCV, GridSearchCV, StratifiedKFold
import xgboost as xgb

In [7]:
# feval function에 사용하기 위해
def rmse_exp(predictions, dmat):
    labels = dmat.get_label()
    error = np.expm1(predictions) - np.expm1(labels)
    squared_error = np.square(error)
    mean = np.mean(squared_error)
    return ('rmse_exp', np.sqrt(mean))

In [8]:
xgb_model = xgb.XGBRegressor()

In [9]:
params = {
    'learning_rate':[0.01, 0.05, 0.1]
    ,'n_estimators':[10000]
    ,'num_boost_round':[30, 50, 100]  # 100 ej
    ,'early_stopping_rounds':[5, 10] # 10
    ,'max_depth':[7,8]
    ,'objective': ['reg:linear']
    ,'eval_metric': ['rmse']
    ,'silent': [True]
    ,'seed': [2019]
    ,'subsample':[0.7]
}

In [10]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)

In [11]:
clf = GridSearchCV(xgb_model
                   ,params
                   ,n_jobs=4
                   ,cv=5
                   ,verbose=2, refit=True)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 55.2min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 178.0min
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed: 195.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'learning_rate': [0.01, 0.05, 0.1], 'n_estimators': [10000], 'num_boost_round': [30, 50, 100], 'early_stopping_rounds': [5, 10], 'max_depth': [7, 8], 'objective': ['reg:linear'], 'eval_metric': ['rmse'], 'silent': [True], 'seed': [2019], 'subsample': [0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [12]:
best_est = clf.best_estimator_
print(best_est)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=5, eval_metric='rmse',
       gamma=0, importance_type='gain', learning_rate=0.01,
       max_delta_step=0, max_depth=7, min_child_weight=1, missing=None,
       n_estimators=10000, n_jobs=1, nthread=None, num_boost_round=30,
       objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=2019, silent=True, subsample=0.7)


In [None]:
params = {
    'learning_rate':[0.01, 0.05, 0.1]
    ,'n_estimators':[10000]
    ,'num_boost_round':[30, 50, 100]  # 100 ej
    ,'early_stopping_rounds':[5, 10] # 10
    ,'max_depth':[7,8]
    ,'objective': ['reg:linear']
    ,'eval_metric': ['rmse']
    ,'silent': [True]
    ,'seed': [2019]
    ,'subsample':[0.7]
}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, early_stopping_rounds=5, eval_metric='rmse',
       gamma=0, importance_type='gain', learning_rate=0.01,
       max_delta_step=0, max_depth=7, min_child_weight=1, missing=None,
       n_estimators=10000, n_jobs=1, nthread=None, num_boost_round=30,
       objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=2019, silent=True, subsample=0.7)