In [1]:
from tensorflow import keras
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler




In [25]:
# Load the Boston housing table from the project's data directory.
boston_csv_path = "../ML/data/boston.csv"
df = pd.read_csv(boston_csv_path)

In [26]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


- 보스턴 집값 데이터 독립변수
    - CRIM : 인구 1명당 범죄 발생 수
    - ZN : 25,000 평방 피트 이상의 주거 구역 비중
    - INDUS : 소매업 외 상업이 차지하는 면적 비율
    - CHAS : 찰스강 위치 변수(1 : 강 주변, 0 : 이외)
    - NOX : 일산화질소 농도
    - RM : 집의 평균 방 수 
    - AGE : 1940년 이전에 지어진 비율
    - DIS : 보스턴의 5개 고용 중심지까지의 가중 거리
    - RAD : 방사형 고속도로 접근성 지수
    - TAX : $10,000 당 부동산 세율 총계
    - PTRATIO : 지역별 학생과 교사 비율
    - B : 1000(Bk - 0.63)^2 (Bk : 지역별 흑인 비율)
    - LSTAT : 급여가 낮은 직업에 종사하는 인구 비율(%)
    
- 종속변수
    - PRICE : 가격(단위 : $1,000)

In [27]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [28]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [29]:
# (rows, columns) of the loaded frame.
df.shape

(506, 14)

In [30]:
# Split the frame into the target (PRICE) and the feature columns (everything else).
y = df["PRICE"]
x = df.drop(columns=["PRICE"])

In [31]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=12
)

In [32]:
# Split the training portion again: 80% is used to fit the network (x_sub),
# 20% is kept as a validation set (x_val) for monitoring during training.
x_sub, x_val, y_sub, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=12,
)

In [33]:
# (rows, features) of the subset actually used for fitting.
x_sub.shape

(323, 13)

# 모델 설계

In [34]:
# Start from an empty Sequential model; layers are appended one by one with
# model.add() in the following cells.
model = keras.Sequential()

In [35]:
# Hidden layer 1: 30 ReLU units. The input width is read from the training
# features instead of being hard-coded, so the cell keeps working if feature
# columns are added or removed upstream.
# NOTE: re-running this cell appends another layer to `model`; recreate the
# model first when re-executing.
model.add(keras.layers.Dense(30, activation = "relu", input_shape = (x_sub.shape[1],)))

In [36]:
# Hidden layer 2: 6 ReLU units.
hidden_layer_2 = keras.layers.Dense(units=6, activation="relu")
model.add(hidden_layer_2)

In [37]:
# Output layer: a single unit with no activation, i.e. a linear output for
# the regression target.
output_layer = keras.layers.Dense(units=1)
model.add(output_layer)

In [38]:
# Stop training once the validation loss has not improved for 16 epochs and
# roll the model back to the weights of the best epoch.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor="val_loss",  # Keras default, written out for readability
    patience=16,
    restore_best_weights=True,
)

In [42]:
# Regression setup: optimise mean squared error with Adam and additionally
# report the mean absolute error ("mae") as a metric.
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mae"],
)

In [43]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 30)                420       
                                                                 
 dense_4 (Dense)             (None, 6)                 186       
                                                                 
 dense_5 (Dense)             (None, 1)                 7         
                                                                 
Total params: 613 (2.39 KB)
Trainable params: 613 (2.39 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
# Train for at most 200 epochs; the early-stopping callback may end the run
# sooner based on the validation split passed in validation_data.
# NOTE(review): the features are fed in unscaled although MinMaxScaler is
# imported in this notebook - consider scaling the inputs; confirm intent.
# NOTE(review): no TensorFlow/Keras seed is set in the visible cells, so the
# training result may differ from run to run.
history = model.fit(
    x_sub,
    y_sub,
    epochs=200,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping_cb],
)

Epoch 1/200


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200


Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200


Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200


# 모델 성능 평가

In [45]:
# Score the model on the held-out test set. The returned list follows the
# compile() settings: [loss (mean squared error), mae].
model.evaluate(x_test, y_test)



[41.422637939453125, 4.608356952667236]

In [48]:
# Predict the price for the first test row only (kept 2-D by slicing).
first_test_row = x_test.iloc[:1]
model.predict(first_test_row)



array([[26.700815]], dtype=float32)

In [50]:
# Actual price of the first test row, for comparison with the prediction above.
y_test.iloc[:1]

334    20.7
Name: PRICE, dtype: float64

In [51]:
# Predict every test row, then collapse the 2-D model output into a 1-D array
# so predictions can be indexed like the label series.
test_predictions = model.predict(x_test)
y_pred = test_predictions.flatten()



In [52]:
# Print the first 10 actual prices next to the model's predictions.
# Both values use the same ".3f" format; the previous " 3f" spec (space sign,
# width 3, default 6 decimals) printed the actual price as e.g. " 20.700000".
# Zipping over slices also avoids an IndexError when fewer than 10 test rows exist.
for label, prediction in zip(y_test.iloc[:10], y_pred[:10]):
    print(f"실제가격 : {label:.3f}, 예상가격 : {prediction:.3f}")

실제가격 :  20.700000, 예상가격 : 26.701
실제가격 :  12.700000, 예상가격 : 22.743
실제가격 :  8.500000, 예상가격 : 8.207
실제가격 :  25.100000, 예상가격 : 27.009
실제가격 :  28.200000, 예상가격 : 34.280
실제가격 :  22.500000, 예상가격 : 22.751
실제가격 :  18.200000, 예상가격 : 24.005
실제가격 :  43.500000, 예상가격 : 32.649
실제가격 :  36.100000, 예상가격 : 32.116
실제가격 :  23.800000, 예상가격 : 26.533
