In [1]:
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/house_train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
df.isnull().sum().sort_values(ascending=False).head(20)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
Id                 0
dtype: int64

In [5]:
df['LotShape']

0       Reg
1       Reg
2       IR1
3       IR1
4       IR1
       ... 
1455    Reg
1456    Reg
1457    Reg
1458    Reg
1459    Reg
Name: LotShape, Length: 1460, dtype: object

In [6]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
df = pd.get_dummies(df, dtype=int)
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,False,False,False,True,False,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,False,False,False,True,False,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,False,False,False,True,False,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,False,False,False,True,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,False,False,False,True,False,False,False,False,True,False


In [9]:
df = df.fillna(df.mean())

In [10]:
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,False,False,False,True,False,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,False,False,False,True,False,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,False,False,False,True,False,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,False,False,False,True,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,False,False,False,True,False,False,False,False,True,False
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,False,False,False,True,False,False,False,False,True,False
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,False,False,False,True,False,False,False,False,True,False
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,False,False,False,True,False,False,False,False,True,False


In [11]:
df_corr = df.corr()
df_corr.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,1.0,0.011156,-0.009601,-0.033226,-0.028365,0.012609,-0.012713,-0.021998,-0.050199,-0.005024,...,-0.020738,-0.018998,0.03892,0.026133,0.007009,-0.034852,-0.009018,0.004865,0.015881,-0.020738
MSSubClass,0.011156,1.0,-0.357056,-0.139781,0.032628,-0.059316,0.02785,0.040581,0.022895,-0.069836,...,0.014005,-0.045156,-0.014555,0.026359,0.005003,0.016241,0.030002,0.000983,0.024359,-0.051068
LotFrontage,-0.009601,-0.357056,1.0,0.306795,0.234196,-0.05282,0.117598,0.082746,0.179283,0.215828,...,-0.051787,0.12658,-0.023461,-0.089928,-0.021846,-0.03702,-0.01809,0.015818,-0.072074,0.124842
LotArea,-0.033226,-0.139781,0.306795,1.0,0.105806,-0.005636,0.014228,0.013788,0.10396,0.214103,...,-0.01504,0.020039,-0.005722,-0.002292,-0.029126,-0.013208,0.008966,-0.010781,0.005711,0.022635
OverallQual,-0.028365,0.032628,0.234196,0.105806,1.0,-0.091932,0.572323,0.550684,0.410238,0.239666,...,-0.021172,0.327412,-0.057962,-0.225013,-0.103535,-0.041677,-0.04495,-0.025515,-0.143282,0.323295


In [12]:
df_corr_sort = df_corr.sort_values('SalePrice', ascending=False)
df_corr_sort['SalePrice'].head(10)

SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
BsmtQual_Ex     0.553105
TotRmsAbvGrd    0.533723
Name: SalePrice, dtype: float64

In [13]:
cols_train = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
              'TotalBsmtSF', '1stFlrSF']
X_train_pre = df[cols_train]
y = df['SalePrice'].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_train_pre, y, test_size=0.2)

In [16]:
X_train.shape

(1168, 6)

In [17]:
model = models.Sequential()
model.add(layers.Dense(10, input_dim=X_train.shape[1], activation='relu'))
model.add(layers.Dense(30, activation='relu'))
model.add(layers.Dense(40, activation='relu'))
model.add(layers.Dense(1))

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                70        
                                                                 
 dense_1 (Dense)             (None, 30)                330       
                                                                 
 dense_2 (Dense)             (None, 40)                1240      
                                                                 
 dense_3 (Dense)             (None, 1)                 41        
                                                                 
Total params: 1,681
Trainable params: 1,681
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(loss='mean_squared_error', optimizer='adam')
early_stopping = EarlyStopping(monitor='val_loss', patience=20)
modelpath = 'data/house.hdf5'
checkpoint = ModelCheckpoint(filepath=modelpath, monitor='val_loss',
                            verbose=0, save_best_only=True)

In [20]:
h = model.fit(X_train, y_train, validation_split=0.25, epochs=2000,
             batch_size=32, callbacks=[early_stopping, checkpoint])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000


Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
Epoch 73/2000
Epoch 74/2000
Epoch 75/2000
Epoch 76/2000
Epoch 77/2000
Epoch 78/2000
Epoch 79/2000
Epoch 80/2000
Epoch 81/2000
Epoch 82/2000
Epoch 83/2000
Epoch 84/2000
Epoch 85/2000
Epoch 86/2000
Epoch 87/2000
Epoch 88/2000
Epoch 89/2000
Epoch 90/2000
Epoch 91/2000
Epoch 92/2000
Epoch 93/2000
Epoch 94/2000
Epoch 95/2000
Epoch 96/2000
Epoch 97/2000
Epoch 98/2000
Epoch 99/2000
Epoch 100/2000
Epoch 101/2000
Epoch 102/2000
Epoch 103/2000
Epoch 104/2000
Epoch 105/2000
Epoch 106/2000
Epoch 107/2000
Epoch 108/2000
Epoch 109/2000
Epoch 110/2000
Epoch 111/2000
Epoch 112/2000
Epoch 113/2000
Epoch 114/2000
Epoch 115/2000
Epoch 116/2000
Epoch 117/2000
Epoch 118/2000
Epoch 119/2000
Epoch 120/2000
Epoch 121/2000
Epoch 122/2000
Epoch 123/2000
Epoch 124/2000
Epoch 125/2000
Epoch 126/2000
Epoch 127/2000
Epoch 128/2000
Epoch 129/2000
Epoch 130/2000
Epoch 131/2000
Epoch 132/2000
Epoch 133/2000
Epoch 134/2000
Epoch 135/2000
Epoch 136/2000
Epoch 137/2000
Epoch 138/

Epoch 139/2000
Epoch 140/2000
Epoch 141/2000
Epoch 142/2000
Epoch 143/2000


In [21]:
real_price = []
pred_price = []
X_num = []

n_iter = 0
y_prediction = model.predict(X_test).flatten()
for i in range(50):
    real = y_test[i]
    prediction = y_prediction[i]
    print('실제가격: {:.2f}, 예상가격 : {:.2f}'.format(real, prediction))
    real_price.append(real)
    pred_price.append(prediction)
    n_iter += 1
    X_num.append(n_iter)

실제가격: 160000.00, 예상가격 : 179491.72
실제가격: 106000.00, 예상가격 : 203423.11
실제가격: 129000.00, 예상가격 : 159517.89
실제가격: 230000.00, 예상가격 : 226659.81
실제가격: 91000.00, 예상가격 : 108357.80
실제가격: 325000.00, 예상가격 : 304555.03
실제가격: 236000.00, 예상가격 : 234085.75
실제가격: 196000.00, 예상가격 : 224508.45
실제가격: 191000.00, 예상가격 : 241429.66
실제가격: 227680.00, 예상가격 : 196138.03
실제가격: 161000.00, 예상가격 : 149537.67
실제가격: 166000.00, 예상가격 : 184638.69
실제가격: 82000.00, 예상가격 : 107236.16
실제가격: 34900.00, 예상가격 : 75341.64
실제가격: 212000.00, 예상가격 : 188314.41
실제가격: 320000.00, 예상가격 : 266537.12
실제가격: 96500.00, 예상가격 : 95672.59
실제가격: 122000.00, 예상가격 : 139031.81
실제가격: 285000.00, 예상가격 : 259160.38
실제가격: 160000.00, 예상가격 : 232768.61
실제가격: 138500.00, 예상가격 : 134575.20
실제가격: 204000.00, 예상가격 : 186496.83
실제가격: 277000.00, 예상가격 : 220284.98
실제가격: 197900.00, 예상가격 : 202819.72
실제가격: 210000.00, 예상가격 : 233333.95
실제가격: 175000.00, 예상가격 : 163810.91
실제가격: 174000.00, 예상가격 : 160972.09
실제가격: 108000.00, 예상가격 : 104157.64
실제가격: 119000.00, 예상가격 : 122942.65
실제가격: 169500.00, 예상가