# 결정트리를 이용한 멜버른 집값 예측

In [1]:
!ls drive/MyDrive

 Classroom  'Colab Notebooks'   ml


In [2]:
!ls drive/MyDrive/data

ls: cannot access 'drive/MyDrive/data': No such file or directory


## 데이터 불러오기, 모델 학습 및 예측


In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, f1_score, precision_score

In [5]:
melbourne_file_path = 'drive/MyDrive/ml/data/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [6]:
y = melbourne_data.Price
y

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
           ...    
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, Length: 13580, dtype: float64

In [7]:
melbourne_features = ['Rooms','Bathroom','Landsize','Lattitude','Longtitude']

In [8]:
X = melbourne_data[melbourne_features]
X

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.79960,144.99840
1,2,1.0,156.0,-37.80790,144.99340
2,3,2.0,134.0,-37.80930,144.99440
3,3,2.0,94.0,-37.79690,144.99690
4,4,1.0,120.0,-37.80720,144.99410
...,...,...,...,...,...
13575,4,2.0,652.0,-37.90562,145.16761
13576,3,2.0,333.0,-37.85927,144.87904
13577,3,2.0,436.0,-37.85274,144.88738
13578,4,1.0,866.0,-37.85908,144.89299


In [9]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.7996,144.9984
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
3,3,2.0,94.0,-37.7969,144.9969
4,4,1.0,120.0,-37.8072,144.9941


In [10]:
melbourne_model = DecisionTreeRegressor(random_state=0)
melbourne_model.fit(X,y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [11]:
melbourne_model.predict(X.head())

array([1480000., 1035000., 1465000.,  850000., 1600000.])

In [12]:
melbourne_model.predict(X[:5]) - y[:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Price, dtype: float64

In [13]:
melbourne_model.score(X,y)

0.9997390882943573

## 모델 '제대로' 평가하기

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(12222, 5) (12222,)
(1358, 5) (1358,)


In [15]:
melbourne_model.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [16]:
melbourne_model.predict(X_test.head())

array([ 715000., 2330000., 2600000., 1750000., 3100000.])

In [17]:
melbourne_model.predict(X_train[:5]) - y_train[:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Price, dtype: float64

In [18]:
melbourne_model.predict(X_test)[:5] - y_test[:5]

12222     115000.0
12223     545000.0
12224     -50000.0
12225    -201000.0
12226    1150000.0
Name: Price, dtype: float64

In [19]:
melbourne_model.score(X_train, y_train)

0.9997104417677788

In [20]:
dt_r2 = melbourne_model.score(X_test, y_test)
dt_r2

0.37362922089637307

## Light GBM

In [21]:
#마이크로소프트에서 개발한 모델
import lightgbm

In [22]:
train_data = lightgbm.Dataset(X_train, label=y_train) #categorical_feature=categorical_features 
test_data = lightgbm.Dataset(X_test, label=y_test)

In [23]:
#파라미터 고려해야될게 너무 많다.
parameters = {
    'objective': 'regression',
    'metric': 'mse', #cost_function
    'boosting': 'gbdt', #그레디언트 부스팅 decision tree
    'num_leaves': 31, #잎의 갯수
    'learning_rate': 0.05, #학습률 높을수록 빠르지만, overfitting될 확률 높음
    'verbose_eval': 10,
    'metric_freq': 10,
}

In [24]:
lgbm_model = lightgbm.train(parameters,
                            train_data,
                            valid_sets=test_data,
                            num_boost_round=5000,
                            early_stopping_rounds=10)

[1]	valid_0's l2: 3.81902e+11
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l2: 3.61108e+11
[3]	valid_0's l2: 3.41968e+11
[4]	valid_0's l2: 3.24365e+11
[5]	valid_0's l2: 3.08223e+11
[6]	valid_0's l2: 2.92927e+11
[7]	valid_0's l2: 2.79067e+11
[8]	valid_0's l2: 2.66565e+11
[9]	valid_0's l2: 2.55089e+11
[10]	valid_0's l2: 2.43834e+11
[11]	valid_0's l2: 2.33261e+11
[12]	valid_0's l2: 2.23641e+11
[13]	valid_0's l2: 2.15173e+11
[14]	valid_0's l2: 2.07455e+11
[15]	valid_0's l2: 2.00131e+11
[16]	valid_0's l2: 1.93941e+11
[17]	valid_0's l2: 1.87718e+11
[18]	valid_0's l2: 1.81212e+11
[19]	valid_0's l2: 1.75605e+11
[20]	valid_0's l2: 1.70094e+11
[21]	valid_0's l2: 1.65779e+11
[22]	valid_0's l2: 1.61842e+11
[23]	valid_0's l2: 1.57709e+11
[24]	valid_0's l2: 1.5381e+11
[25]	valid_0's l2: 1.50354e+11
[26]	valid_0's l2: 1.4691e+11
[27]	valid_0's l2: 1.4411e+11
[28]	valid_0's l2: 1.41326e+11
[29]	valid_0's l2: 1.38398e+11
[30]	valid_0's l2: 1.35775e+11
[31]	valid_0's l2: 1

In [25]:
lgbm_model.predict(X_test[:5])

array([ 574103.94460752, 1959851.0966476 , 2310908.53231483,
       1804196.62121925, 2340710.72683632])

In [26]:
lgbm_model.predict(X_test)[:5] - y_test[:5]

12222    -25896.055392
12223    174851.096648
12224   -339091.467685
12225   -146803.378781
12226    390710.726836
Name: Price, dtype: float64

In [27]:
lgbm_r2 = r2_score(lgbm_model.predict(X_test), y_test)
print(dt_r2, lgbm_r2)

0.37362922089637307 0.7166618912536572


In [30]:
dt_mse = mean_squared_error(melbourne_model.predict(X_test), y_test, squared=False)
lgbm_mse = mean_squared_error(lgbm_model.predict(X_test), y_test, squared=False)
print(dt_mse, lgbm_mse)

495521.3859090252 294867.29611620615
