In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
from datetime import datetime
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Args:
        start_time: A ``datetime`` previously returned by this function, or a
            falsy value (the default ``None``) to start timing.

    Returns:
        ``datetime.now()`` when ``start_time`` is falsy; otherwise ``None``
        after printing the elapsed time as hours, minutes and seconds.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    # No start time supplied: begin timing now.
    return datetime.now()

In [3]:
# Load the training and test sets from CSV files in the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## 데이터 수정

In [4]:
for df in (df_train, df_test):
    # Truncate the date string from day resolution to month resolution by
    # keeping its first six characters (slice [0:8] instead to keep the day).
    df['date'] = df['date'].map(lambda stamp: stamp[:6])
    # A renovation year of 0 is treated as missing and replaced by the year
    # the house was built.
    df['yr_renovated'] = df['yr_renovated'].replace(0, np.nan).fillna(df['yr_built'])

# Round the training target to the nearest 1000.
df_train['price'] = df_train['price'].map(lambda amount: round(amount, -3))

In [5]:
# Add the same derived columns to both frames, in place.
for df in (df_train, df_test):
    df['total_rooms'] = df['bedrooms'].add(df['bathrooms'])
    df['sqft_ratio'] = df['sqft_living'].div(df['sqft_lot'])
    df['sqft_total_size'] = df['sqft_above'].add(df['sqft_basement'])
    df['sqft_ratio15'] = df['sqft_living15'].div(df['sqft_lot15'])
    # 1 when the renovation year differs from the build year, 0 when they match.
    df['is_renovated'] = df['yr_renovated'].ne(df['yr_built']).astype('int64')
    # The date is a digit string at this point; store it as an integer.
    df['date'] = df['date'].astype('int')

In [6]:
# Price per unit of sqft_total_size for every training row.
df_train['per_price'] = df_train['price'] / df_train['sqft_total_size']

# Mean and variance of that unit price per zipcode. The aggregations are given
# as a list (not a set) so the order of the resulting 'mean' and 'var' columns
# is the same on every run.
# NOTE(review): these statistics are computed from the training target, and
# each training row's own price contributes to the value merged back onto it.
zipcode_price = (
    df_train.groupby(['zipcode'])['per_price']
    .agg(['mean', 'var'])
    .reset_index()
)

# Left joins keep every row; zipcodes absent from the training data get NaN.
df_train = pd.merge(df_train, zipcode_price, how='left', on='zipcode')
df_test = pd.merge(df_test, zipcode_price, how='left', on='zipcode')

# Scale the per-unit statistics by each house's own total size.
for df in [df_train, df_test]:
    df['mean'] = df['mean'] * df['sqft_total_size']
    df['var'] = df['var'] * df['sqft_total_size']

### 라벨 인코더

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode 'date' and 'zipcode' as integer codes in BOTH frames.
# The encoder is fitted on the union of train and test values so that the two
# frames share one code per value; they are later concatenated and one-hot
# encoded together on these columns, which requires a common encoding.
# 1-D Series are passed because LabelEncoder expects 1-D input.
LE = LabelEncoder()
for col in ['date', 'zipcode']:
    LE.fit(pd.concat([df_train[col], df_test[col]]))
    df_train[col] = LE.transform(df_train[col])
    df_test[col] = LE.transform(df_test[col])

### 상호작용변수 추가

### 월별 + ZIPCODE를 활용한 더미변수 추가하여 테스트
위도와 경도로 위치 정보를 나타내고 싶었지만, 구역을 어떻게 나눠야 할지 몰라 ZIPCODE로 대신함.

# Remember where the training rows end so the stacked frame can be split again.
df_train_length = len(df_train)
concat_dataset = pd.concat(objs=[df_train, df_test], axis=0)

# Interaction feature: "<date>_<zipcode>" as a single categorical value.
date_as_text = concat_dataset['date'].map(str)
zipcode_as_text = concat_dataset['zipcode'].map(str)
concat_dataset['date_zipcode'] = date_as_text + "_" + zipcode_as_text

# One-hot encode on the stacked frame so train and test get the same columns.
dataset_preprocessed = pd.get_dummies(concat_dataset, columns=['date', 'zipcode', 'date_zipcode'])

# Split back by row position: the first df_train_length rows are the training set.
df_train = dataset_preprocessed.iloc[:df_train_length]
df_test = dataset_preprocessed.iloc[df_train_length:]

## XGBoost

In [8]:
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
import xgboost as xgb

In [9]:
# Columns that are never model inputs: the row id, the target, and the
# unit price that was derived from the target.
excluded_columns = ('id', 'price', 'per_price')
train_columns = [name for name in df_train.columns if name not in excluded_columns]

# Regression target.
y_reg = df_train['price']

In [10]:
# Hyper-parameter dictionary for XGBoost.
# NOTE(review): 'num_boost_round' and 'early_stopping_rounds' are normally
# arguments of xgb.train / xgb.cv rather than booster parameters, and
# 'reg:linear' is the older name of the squared-error objective - confirm
# against the installed xgboost version before relying on them.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=7,
    subsample=0.7,
    num_boost_round=30,
    early_stopping_rounds=5,
    eval_metric='rmse',
    objective='reg:linear',
    seed=2019,
)

In [None]:
# Gradient-boosted regressor. Only estimator parameters are passed here;
# arguments that belong to xgb.train / xgb.cv (num_boost_round, verbose_eval,
# show_stdv) are not estimator parameters and are therefore not set.
model = xgb.XGBRegressor(
    # Step-size shrinkage (booster alias: eta, default 0.3).
    learning_rate=0.005,
    # Upper bound on the number of boosted trees (default 100); early
    # stopping in fit() ends training sooner when validation RMSE stalls.
    n_estimators=80001,
    random_state=2019,
    max_depth=7,
    nthread=-1,
    tree_method='gpu_hist'
)

# 5-fold cross-validation with a fixed shuffle for reproducibility.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)

# Out-of-fold predictions and per-fold validation RMSE, so the loop yields
# an actual cross-validation estimate.
oof_predictions = np.zeros(len(df_train))
fold_rmse = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train)):
    X_train, X_validation = df_train.iloc[trn_idx][train_columns], df_train.iloc[val_idx][train_columns]
    y_train, y_validation = y_reg.iloc[trn_idx], y_reg.iloc[val_idx]

    # fit() retrains the estimator from scratch for each fold.
    model.fit(X_train, y_train
              , eval_set=[(X_train, y_train), (X_validation, y_validation)]
              , eval_metric='rmse'
              # Guard against overfitting: stop once the metric on the last
              # eval_set entry (the validation fold) has not improved for
              # 5 rounds. Passed here, next to eval_metric, so that it is
              # applied by the fit-time API this call already relies on.
              , early_stopping_rounds=5
              # Print the evaluation log every 2000 boosting rounds.
              , verbose=2000
              )

    # Record this fold's validation predictions and score.
    oof_predictions[val_idx] = model.predict(X_validation)
    rmse = float(np.sqrt(np.mean((oof_predictions[val_idx] - y_validation.values) ** 2)))
    fold_rmse.append(rmse)
    print('fold %i validation rmse: %.2f' % (fold_, rmse))