In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Read the house-sales CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./data/kc_house_data.csv')
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


```
id: 집 고유아이디
date: 집이 팔린 날짜
price: 집 가격 (타겟변수)
bedrooms: 주택 당 침실 개수
bathrooms: 주택 당 화장실 개수
floors: 전체 층 개수
waterfront: 해변이 보이는지(0, 1)
condition: 집 청소상태 (1~5)
grade : King County grading system으로 인한 평점(1~13)
yr_built: 집이 지어진 년도
yr_renovated: 집이 리모델링 된 년도
zipcode: 우편번호
lat: 위도
long: 경도
```

In [4]:
# data.shape is (rows, columns): ncar holds the row count, nvar the column count.
ncar, nvar = data.shape
print(ncar, nvar, sep='\n')

21613
14


In [7]:
# Remove the identifier, sale-date and location columns so they are not used
# as model inputs.
# `data` is rebound instead of mutated with inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
data = data.drop(columns=['id', 'date', 'zipcode', 'lat', 'long'], errors='ignore')

### 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리

In [9]:
# Every column except the target 'price' is used as a feature.
featuer_columns = data.columns.difference(['price']).tolist()
X, y = data[featuer_columns], data['price']

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

(15129, 8) (6484, 8) (15129,) (6484,)


### 학습데이터를 선형회귀모형에 적합 후 평가 데이터로 검증(Stats_Models)

In [10]:
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

In [11]:
# Build the training design matrix (features plus the `const` column) and
# fit an ordinary-least-squares model of price on it.
sm_train_x = sm.add_constant(train_x, has_constant='add')
sm_model = sm.OLS(endog=train_y, exog=sm_train_x)
fitted_sm_model = sm_model.fit()

# Print the full regression report (coefficients, R-squared, etc.).
print(fitted_sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.595
Model:                            OLS   Adj. R-squared:                  0.595
Method:                 Least Squares   F-statistic:                     2776.
Date:                Mon, 17 Jan 2022   Prob (F-statistic):               0.00
Time:                        06:56:38   Log-Likelihood:            -2.0826e+05
No. Observations:               15129   AIC:                         4.165e+05
Df Residuals:                   15120   BIC:                         4.166e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         7.186e+06   1.73e+05     41.548   

  x = pd.concat(x[::order], 1)


In [12]:
# Give the test features the same constant column as the training design
# matrix, then predict prices with the fitted OLS model.
sm_test_x = sm.add_constant(test_x, has_constant='add')
sm_model_predict = fitted_sm_model.predict(exog=sm_test_x)

  x = pd.concat(x[::order], 1)


In [13]:
# MSE of the OLS predictions on the held-out set.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the
# same for MSE, but the convention matters for asymmetric metrics.
mean_squared_error(test_y, sm_model_predict)

57506100719.89744

In [14]:
# RMSE of the OLS predictions on the held-out set (square root of the MSE).
# Arguments follow scikit-learn's (y_true, y_pred) order.
sqrt(mean_squared_error(test_y, sm_model_predict))

239804.2967085816

## Bagging

In [33]:
# Manual bagging with OLS: fit one model per bootstrap resample of the
# training split and keep each model's test-set predictions.
bagging_predict_result = []

# A dedicated, seeded generator makes the resamples (and therefore the
# printed scores) reproducible across runs.
bootstrap_rng = np.random.default_rng(42)
n_train = train_x.shape[0]

for _ in range(10):
    # Row positions drawn with replacement; about 63% of the rows end up
    # in each resample, which the printed unique count confirms.
    boot_idx = bootstrap_rng.choice(n_train, size=n_train, replace=True)
    print(len(np.unique(boot_idx)))

    # Loop-local names are used so the single-model objects fitted earlier
    # (sm_train_x, sm_model, fitted_sm_model) are not overwritten.
    boot_x = sm.add_constant(train_x.iloc[boot_idx], has_constant='add')
    boot_y = train_y.iloc[boot_idx]
    boot_model = sm.OLS(boot_y, boot_x).fit()

    pred = boot_model.predict(sm_test_x)

    bagging_predict_result.append(pred)
    # Per-model RMSE on the test set, arguments in (y_true, y_pred) order.
    print(sqrt(mean_squared_error(test_y, pred)))

9537
239856.57280207184
9602
240292.70616367337
9576
240025.36205059505
9593
240917.0961350019
9589
239839.47950455488
9544
240106.0207556538
9503
239905.70102744416
9602
239655.99027126358
9565
240863.28325012338


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


9519
239706.51646215824


  x = pd.concat(x[::order], 1)


In [34]:
# Inspect the test-set predictions stored at index 1 of the bootstrap results.
bagging_predict_result[1]

735      5.660367e+05
2830     7.094837e+05
4106     1.095953e+06
16218    1.448106e+06
19964    6.965060e+05
             ...     
12606    6.084143e+05
14393    6.756988e+05
6899     3.268014e+05
85       8.937443e+05
21363    4.369498e+05
Length: 6484, dtype: float64

In [35]:
# Bagged prediction: element-wise mean of the bootstrap models' predictions.
# np.asarray drops any pandas index so rows are matched by position, and the
# mean over axis 0 replaces the nested per-row / per-model Python loops.
# The result is a 1-D ndarray with one value per test row.
bagging_predict = np.mean(
    [np.asarray(model_pred) for model_pred in bagging_predict_result], axis=0
)

In [36]:
# RMSE of the averaged (bagged) predictions on the held-out set.
# Arguments follow scikit-learn's (y_true, y_pred) order.
sqrt(mean_squared_error(test_y, bagging_predict))

239851.11010302018

### 학습데이터를 선형회귀모형에 적합 후 평가데이터로 검증(Scikit-Learn)

In [37]:
from sklearn.linear_model import LinearRegression

In [39]:
# Ordinary least-squares regression on the training split.
# fit() hands back the estimator it was called on, so linear_model1 and
# regression_model refer to the same fitted object.
regression_model = LinearRegression()
regression_model.fit(train_x, train_y)
linear_model1 = regression_model

### Bagging을 이용하여 선형회귀모형에 적합 후 평가(Sampling 5번, 30번)

In [40]:
from sklearn.ensemble import BaggingRegressor

In [52]:
# Bagging ensemble of 5 linear regressions.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the positional form works on either side of the rename.
# random_state pins the resampling so the reported RMSE is reproducible.
bagging_model = BaggingRegressor(regression_model, n_estimators=5, random_state=42)
linear_model2 = bagging_model.fit(train_x, train_y)
predict2 = linear_model2.predict(test_x)

# Test-set RMSE, arguments in scikit-learn's (y_true, y_pred) order.
print(sqrt(mean_squared_error(test_y, predict2)))

239992.76356517134


In [58]:
# Bagging ensemble of 30 linear regressions.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the positional form works on either side of the rename.
# random_state pins the resampling so the reported RMSE is reproducible.
bagging_model = BaggingRegressor(regression_model, n_estimators=30, random_state=42)
linear_model2 = bagging_model.fit(train_x, train_y)
predict2 = linear_model2.predict(test_x)

# Test-set RMSE, arguments in scikit-learn's (y_true, y_pred) order.
print(sqrt(mean_squared_error(test_y, predict2)))

239720.7232084481


### 학습데이터를 의사결정나무모형에 적합 후 평가 데이터로 검증

In [59]:
from sklearn.tree import DecisionTreeRegressor

In [62]:
# Single regression tree fitted on the training split as a baseline.
# random_state is fixed because the tree breaks ties between equally good
# splits at random, so an unseeded tree can score differently on every run.
decision_tree_model = DecisionTreeRegressor(random_state=42)
tree_model = decision_tree_model.fit(train_x, train_y)
predict_tree = tree_model.predict(test_x)

# Test-set RMSE, arguments in scikit-learn's (y_true, y_pred) order.
print(sqrt(mean_squared_error(test_y, predict_tree)))

295996.2936341859


In [67]:
# Manual bagging with regression trees: fit one tree per bootstrap resample
# of the training split and keep each tree's test-set predictions.
bagging_predict_result = []

# A dedicated, seeded generator makes the resamples reproducible across runs.
bootstrap_rng = np.random.default_rng(42)
n_train = train_x.shape[0]

for model_idx in range(10):
    # Row positions drawn with replacement; about 63% of the rows end up
    # in each resample, which the printed unique count confirms.
    boot_idx = bootstrap_rng.choice(n_train, size=n_train, replace=True)
    print(len(np.unique(boot_idx)))

    boot_x = train_x.iloc[boot_idx]
    boot_y = train_y.iloc[boot_idx]

    # Loop-local names are used so the baseline objects from the single-tree
    # cell (decision_tree_model, tree_model, predict_tree) are not overwritten.
    # Each tree gets its own fixed seed so its random tie-breaking is repeatable.
    boot_tree = DecisionTreeRegressor(random_state=model_idx)
    boot_pred = boot_tree.fit(boot_x, boot_y).predict(test_x)

    bagging_predict_result.append(boot_pred)
    # Per-tree RMSE on the test set, arguments in (y_true, y_pred) order.
    print(sqrt(mean_squared_error(test_y, boot_pred)))

9569
306936.4731088418
9500
283470.55077733874
9536
306124.8565905851
9586
296456.94224097213
9610
288542.1280224167
9573
313655.6138915013
9562
309425.07761432003
9643
295900.2364018849
9605
308934.52907982824
9625
284851.974184133


In [68]:
# Bagged prediction: element-wise mean of the bootstrap trees' predictions.
# Stacking the per-model prediction vectors and averaging over axis 0
# replaces the nested per-row / per-model Python loops.
# The result is a 1-D ndarray with one value per test row.
bagging_predict = np.mean(
    [np.asarray(model_pred) for model_pred in bagging_predict_result], axis=0
)

In [69]:
# RMSE of the averaged (bagged) tree predictions on the held-out set.
# Arguments follow scikit-learn's (y_true, y_pred) order.
sqrt(mean_squared_error(test_y, bagging_predict))

239320.27643264446

### Bagging을 이용하여 의사결정나무모형에 적합 후 평가(Sampling 10번)

In [75]:
# Bagging ensemble of 10 regression trees.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the positional form works on either side of the rename.
# random_state pins the resampling so the reported RMSE is reproducible.
bagging_model = BaggingRegressor(decision_tree_model, n_estimators=10, random_state=42)
# NOTE: linear_model2 / predict2 are the names reused from the linear bagging
# cells; here they hold the fitted tree ensemble and its predictions.
linear_model2 = bagging_model.fit(train_x, train_y)
predict2 = linear_model2.predict(test_x)

# Test-set RMSE, arguments in scikit-learn's (y_true, y_pred) order.
print(sqrt(mean_squared_error(test_y, predict2)))

235254.4358990768


In [77]:
# Bagging ensemble of 30 regression trees.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the positional form works on either side of the rename.
# random_state pins the resampling so the reported RMSE is reproducible.
bagging_model = BaggingRegressor(decision_tree_model, n_estimators=30, random_state=42)
# NOTE: linear_model2 / predict2 are the names reused from the linear bagging
# cells; here they hold the fitted tree ensemble and its predictions.
linear_model2 = bagging_model.fit(train_x, train_y)
predict2 = linear_model2.predict(test_x)

# Test-set RMSE, arguments in scikit-learn's (y_true, y_pred) order.
print(sqrt(mean_squared_error(test_y, predict2)))

232703.08875860774
