In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

## 데이터 수정

In [3]:
X_train = df_train.drop(['id', 'price'], axis=1)
y_train = np.log1p(df_train['price'])
X_test = df_test.drop(['id'], axis=1)

In [4]:
# Adding features
for df in [X_train, X_test]:
    df['date(new)'] = df['date'].apply(lambda x: int(x[4:8])+800 if x[:4] == '2015' else int(x[4:8])-400)
    df['how_old'] = df['date'].apply(lambda x: x[:4]).astype(int) - df[['yr_built', 'yr_renovated']].max(axis=1)
    del df['date']
    del df['yr_renovated']
    df['yr_built'] = df['yr_built'] - 1900
    df['sqft_floor'] = df['sqft_above'] / df['floors']
    df['floor_area_ratio'] = df['sqft_living'] / df['sqft_lot']
    del df['sqft_lot15']

In [5]:
# Log Scaling
log_features = ['bedrooms', 'bathrooms', 'sqft_lot', 'sqft_living', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_floor', 'floor_area_ratio', 'floor_area_ratio']
for feature in log_features:
    for df in [X_train, X_test]:
        df[feature] = np.log1p(df[feature])

In [6]:
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, RandomizedSearchCV, GridSearchCV, StratifiedKFold
import xgboost as xgb

In [7]:
# feval function에 사용하기 위해
def rmse_exp(predictions, dmat):
    labels = dmat.get_label()
    error = np.expm1(predictions) - np.expm1(labels)
    squared_error = np.square(error)
    mean = np.mean(squared_error)
    return ('rmse_exp', np.sqrt(mean))

In [15]:
xgb_model = xgb.XGBRegressor()

In [16]:
params = {
    'learning_rate':[0.5, 0.1, 0.05, 0.01]
    ,'n_estimators':[3000]
    ,'num_boost_round':[1000, 3000, 5000, 10000]
    ,'early_stopping_rounds':[50, 100, 500, 1000]
    ,'max_depth':[5, 10, 30, 50]
    ,'objective': ['reg:linear']
    ,'eval_metric': ['rmse']
    ,'silent': [True]
    ,'subsample':[0.7]
}

In [19]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)

In [None]:
folds = 3
param_comb = 5
clf = GridSearchCV(xgb_model
                   ,params
                   ,n_jobs=4
                   ,cv=5
                   ,verbose=2, refit=True)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  7.2min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 32.1min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 144.4min


In [None]:
best_est = clf.best_estimator_
print(best_est)