In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test tables from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## 데이터 수정

In [3]:
# Normalise the raw columns of both frames in place.
for df in [df_train, df_test]:
    # Keep only the first 8 characters of the date string
    # (presumably a YYYYMMDD prefix -- TODO confirm against the raw data).
    # The vectorised .str accessor avoids a Python-level lambda per row and
    # leaves missing values as NaN instead of raising.
    df['date'] = df['date'].str[:8]
    # A yr_renovated of 0 is treated as "no renovation year recorded": turn it
    # into NaN and fall back to the construction year, so the column always
    # holds a year. Going through NaN keeps the float dtype the original
    # apply/fillna sequence produced.
    df['yr_renovated'] = df['yr_renovated'].replace(0, np.nan).fillna(df['yr_built'])

In [4]:
# Add the derived features to the training and test frames in place.
# Columns are created in a fixed order because the model's feature list is
# later taken from the frame's column order.
for frame in (df_train, df_test):
    # Combined count of bedrooms and bathrooms.
    frame['total_rooms'] = frame['bedrooms'].add(frame['bathrooms'])
    # Living area relative to lot area, for the house itself.
    frame['sqft_ratio'] = frame['sqft_living'].div(frame['sqft_lot'])
    # Above-ground plus basement area.
    frame['sqft_total_size'] = frame['sqft_above'].add(frame['sqft_basement'])
    # Same living/lot ratio computed from the *15 columns.
    frame['sqft_ratio15'] = frame['sqft_living15'].div(frame['sqft_lot15'])
    # 0 when the renovation year equals the build year, 1 otherwise.
    renovation_differs = frame['yr_renovated'] != frame['yr_built']
    frame['is_renovated'] = renovation_differs.astype('int64')
    # The date string becomes a plain integer feature.
    frame['date'] = frame['date'].astype('int')

In [5]:
# Price per unit of total area, used to build the zipcode-level statistics
# below.
df_train['per_price'] = df_train['price'] / df_train['sqft_total_size']

# Mean and variance of the per-area price within each zipcode.
# The aggregations are given as a list, not a set: a set has no defined
# iteration order, so the order of the 'mean'/'var' columns (and therefore the
# feature order derived from df_train.columns) could differ between runs.
zipcode_price = (
    df_train.groupby('zipcode')['per_price']
    .agg(['mean', 'var'])
    .reset_index()
)

# Attach the zipcode statistics to every row. With a left join, a test zipcode
# that never occurs in the training data gets NaN for both columns.
df_train = pd.merge(df_train, zipcode_price, how='left', on='zipcode')
df_test = pd.merge(df_test, zipcode_price, how='left', on='zipcode')

# NOTE(review): the statistics are computed from every training row, including
# each row's own price, so validation scores that use these features are
# optimistic. Consider computing them out-of-fold.

# Scale the per-area statistics by each house's total area.
for df in [df_train, df_test]:
    df['mean'] = df['mean'] * df['sqft_total_size']
    df['var'] = df['var'] * df['sqft_total_size']

## XGBoost

In [8]:
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
import xgboost as xgb
from xgboost import XGBClassifier

In [9]:
# Model inputs: every column except the row identifier, the target itself and
# the per-area price that is computed from the target.
excluded_columns = ('id', 'price', 'per_price')
train_columns = [column for column in df_train.columns if column not in excluded_columns]

# Regression target.
y_reg = df_train['price']

In [16]:
# Fit one XGBoost regressor per fold of a shuffled 10-fold split, keeping the
# fitted models, the out-of-fold predictions and the per-fold validation RMSE.
folds = KFold(n_splits=10, shuffle=True, random_state=2019)

models = []                                 # fitted regressor of every fold
fold_rmse = []                              # validation RMSE of every fold
oof_predictions = np.zeros(len(df_train))   # prediction for each row from the fold that held it out

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train)):
    X_train, X_validation = df_train.iloc[trn_idx][train_columns], df_train.iloc[val_idx][train_columns]
    y_train, y_validation = y_reg.iloc[trn_idx], y_reg.iloc[val_idx]

    # Only estimator hyper-parameters belong in the constructor.
    # num_boost_round / verbose_eval / show_stdv are options of xgb.train and
    # xgb.cv and had no effect here; n_estimators already sets the number of
    # boosting rounds. The unused DMatrix objects were dropped as well: the
    # scikit-learn wrapper is fitted on the frames directly.
    model = xgb.XGBRegressor(
        learning_rate=0.01,
        n_estimators=8000,
        random_state=2019,
        n_jobs=5,
    )

    # early_stopping_rounds is passed to fit(), matching the API generation
    # this notebook already relies on (eval_metric is given to fit() too).
    # The last eval_set entry -- the validation fold -- drives early stopping.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_validation, y_validation)],
        eval_metric='rmse',
        early_stopping_rounds=2000,
        verbose=1000,
    )

    validation_pred = model.predict(X_validation)
    oof_predictions[val_idx] = validation_pred
    fold_rmse.append(float(np.sqrt(np.mean((y_validation.values - validation_pred) ** 2))))
    models.append(model)

# `model` stays bound to the regressor of the last fold, as before.
print('CV RMSE: {:.1f} +/- {:.1f}'.format(np.mean(fold_rmse), np.std(fold_rmse)))

[0]	validation_0-rmse:645488	validation_1-rmse:689314


  if getattr(data, 'base', None) is not None and \


[1000]	validation_0-rmse:100728	validation_1-rmse:130650
[2000]	validation_0-rmse:90336	validation_1-rmse:126648
[3000]	validation_0-rmse:85256.4	validation_1-rmse:124521
[4000]	validation_0-rmse:83472.8	validation_1-rmse:123956
[5000]	validation_0-rmse:83286.9	validation_1-rmse:123979
[6000]	validation_0-rmse:83254.8	validation_1-rmse:123967
[7000]	validation_0-rmse:83251.7	validation_1-rmse:123965
[7999]	validation_0-rmse:83243.5	validation_1-rmse:123964
[0]	validation_0-rmse:645873	validation_1-rmse:686032
[1000]	validation_0-rmse:101789	validation_1-rmse:137135
[2000]	validation_0-rmse:91447.3	validation_1-rmse:129753
[3000]	validation_0-rmse:85615.5	validation_1-rmse:125927
[4000]	validation_0-rmse:81059	validation_1-rmse:124605
[5000]	validation_0-rmse:77656	validation_1-rmse:123480
[6000]	validation_0-rmse:75557.5	validation_1-rmse:122741
[7000]	validation_0-rmse:75046.2	validation_1-rmse:122380
[7999]	validation_0-rmse:75024.4	validation_1-rmse:122381
[0]	validation_0-rmse:6533

In [11]:
# Score the test set with `model`, i.e. the regressor left bound by the
# training loop, using exactly the feature columns it was trained on.
test_features = df_test.loc[:, train_columns]
predictions = model.predict(test_features)

In [15]:
# Fill the sample submission's price column with the predictions and write it
# out without the index column.
# NOTE(review): values are assigned by position; this assumes the sample
# submission rows are in the same order as the test file -- confirm.
submission = pd.read_csv("sample_submission.csv").assign(price=predictions)
submission.to_csv('submission.csv', index=False)