In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense
import warnings


In [2]:
# Read both CSV files from the working directory: `train` carries the
# SalePrice target column, `test` is concatenated with it further down.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Stack the training features (target column removed) on top of the test
# features so the imputation and encoding cells below treat both identically.
# `columns=` is used instead of a positional axis argument: passing `axis`
# positionally to DataFrame.drop is rejected by pandas >= 2.0.
# NOTE: the row index is not reset, so index labels repeat between the train
# and test portions; rows are told apart by the 'Id' column instead.
X = pd.concat([train.drop(columns='SalePrice'), test])
# The test rows can be recovered later with: X[X['Id'] > 1460]

In [4]:
def count_missing(data):
    """Print the NaN count of every column of `data` that has at least one NaN.

    Columns are listed from most to fewest missing values. Nothing is
    returned; the resulting Series is only printed.
    """
    has_nan = data.isnull().any(axis=0)
    # Restrict to the affected columns first, then count their NaNs.
    nan_counts = data[data.columns[has_nan]].isnull().sum()
    print(nan_counts.sort_values(ascending=False))
# Reference (presumably the source of the count_missing helper — confirm):
# https://lstrln.wordpress.com/2017/10/25/predicting-house-prices-on-kaggle-a-gentle-introduction-to-data-science-part-ii/
# Report which columns of the combined train+test frame contain NaN.
count_missing(X)

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageFinish     159
GarageYrBlt      159
GarageQual       159
GarageCond       159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtFullBath       2
BsmtHalfBath       2
Utilities          2
Functional         2
Exterior2nd        1
Exterior1st        1
SaleType           1
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
Electrical         1
KitchenQual        1
GarageCars         1
GarageArea         1
TotalBsmtSF        1
dtype: int64


In [5]:
# Categorical columns whose missing entries are replaced by the literal
# string 'None' (presumably NaN marks "feature not present" — confirm against
# the data description).
none = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]

# Fill first, then rebuild X with the filled columns appended after the rest.
replace = X[none].fillna('None')
X = pd.concat([X.drop(columns=none), replace], axis=1)
count_missing(X)

LotFrontage     486
GarageYrBlt     159
MasVnrType       24
MasVnrArea       23
MSZoning          4
Functional        2
Utilities         2
BsmtHalfBath      2
BsmtFullBath      2
BsmtFinSF1        1
Exterior1st       1
Exterior2nd       1
SaleType          1
BsmtFinSF2        1
GarageArea        1
TotalBsmtSF       1
Electrical        1
KitchenQual       1
GarageCars        1
BsmtUnfSF         1
dtype: int64


In [6]:
# Continuous columns: replace NaN with the column median (computed over the
# combined train+test frame X).
r_median = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'GarageArea', 'TotalBsmtSF', 'BsmtUnfSF',
]

medians = X[r_median].median()
replace_med = X[r_median].fillna(medians)
# The filled columns are re-attached at the end of the frame.
X = pd.concat([X.drop(columns=r_median), replace_med], axis=1)
count_missing(X)

GarageYrBlt     159
MasVnrType       24
MSZoning          4
Functional        2
BsmtHalfBath      2
BsmtFullBath      2
Utilities         2
SaleType          1
GarageCars        1
KitchenQual       1
Electrical        1
Exterior2nd       1
Exterior1st       1
dtype: int64


In [7]:
# Discrete / categorical columns: replace NaN with the most frequent value.
r_mode = [
    'MasVnrType', 'MSZoning', 'Functional', 'Utilities', 'BsmtFullBath',
    'BsmtHalfBath', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Electrical',
    'KitchenQual', 'GarageCars',
]

# DataFrame.mode() can return several rows when values tie; the first row
# is the one used for filling.
most_common = X[r_mode].mode().iloc[0]
replace_mod = X[r_mode].fillna(most_common)
X = pd.concat([X.drop(columns=r_mode), replace_mod], axis=1)
count_missing(X)

GarageYrBlt    159
dtype: int64


In [8]:
# Where the garage build year is missing, fall back to the row's YearBuilt.
replace_gar = X['GarageYrBlt'].fillna(X['YearBuilt'])
# Swap the original column for the filled one (it ends up as the last column).
X = pd.concat([X.drop(columns='GarageYrBlt'), replace_gar], axis=1)
count_missing(X)

Series([], dtype: float64)


In [9]:
def dummy(df, col):
    """Return a copy of `df` with column `col` replaced by one-hot columns.

    The indicator columns are named ``<col>_<value>`` and are appended after
    the remaining columns of `df`. `df` itself is not modified.
    """
    indicators = pd.get_dummies(df[col], prefix=col)
    remaining = df.drop(columns=[col])
    return pd.concat([remaining, indicators], axis=1)

def data_eng(df, factor_columns):
    """Return a copy of `df` with `factor_columns` replaced by integer codes.

    Each listed column is encoded independently with ``pd.factorize``: codes
    are assigned in order of first appearance in the column (not in any
    semantic order) and missing values are coded as -1. The encoded columns
    are appended after the remaining columns of `df`.
    """
    codes = df[factor_columns].apply(lambda column: pd.factorize(column)[0])
    untouched = df.drop(columns=factor_columns)
    return pd.concat([untouched, codes], axis=1)

In [10]:
# Nominal columns: each is expanded into one-hot indicator columns.
dummies = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'BsmtExposure', 'BsmtFinType1', 'Heating', 'BsmtFinType2', 'CentralAir',
    'Electrical', 'GarageType', 'GarageFinish', 'MiscFeature', 'SaleType',
    'SaleCondition',
]

# Quality / condition style columns: each is replaced by integer codes.
ordinals = [
    'ExterCond', 'ExterQual', 'BsmtQual', 'BsmtCond', 'HeatingQC',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC', 'Fence',
]

In [11]:
# One-hot encode every nominal column in a single call instead of
# re-concatenating the whole frame once per column. With `columns=` given,
# pd.get_dummies encodes exactly the listed columns (numeric ones such as
# MSSubClass included), names the indicators '<column>_<value>', and places
# them after the untouched columns in the order of `dummies` — the same
# layout the per-column loop produced.
X = pd.get_dummies(X, columns=dummies)

In [12]:
# Integer-code the quality/condition columns, then split the combined frame
# back apart by Id: rows with Id > 1460 are the test rows (they keep their
# 'Id' column), the rest are the training features.
X = data_eng(X, ordinals)
test_final = X[X['Id'] > 1460]
X_final = X[X['Id'] < 1461].drop(['Id'], axis=1)

# The model is trained on log(SalePrice); y is kept as a one-column
# DataFrame because the scaler expects 2-D input.
y = np.log(train.SalePrice)
y = pd.DataFrame(y)

scale = True
if scale:
    # Use one scaler per role. Previously a single MinMaxScaler was fitted on
    # the features and then refitted on y, which discarded the feature
    # scaling parameters — test rows could never be transformed the same way.
    # `x_scaler.transform(...)` can now be applied to test features.
    x_scaler = MinMaxScaler(feature_range=(0, 1))
    y_scaler = MinMaxScaler(feature_range=(0, 1))
    # Silence warnings only while scaling rather than for the whole session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X_final = x_scaler.fit_transform(X_final)
        y = y_scaler.fit_transform(y)
    # `scaler` is kept as a name for the target scaler, which is what it
    # referred to after the original refit.
    scaler = y_scaler
    # MinMaxScaler computes scaled = raw * scale_ + min_; these two values
    # are what the later inverse-scaling cell needs for the target.
    multiplied_by = y_scaler.scale_[0]
    added = y_scaler.min_[0]
train_X, val_X, train_y, val_y = train_test_split(X_final, y, random_state=1)

In [13]:
# Fully connected regressor: three ReLU hidden layers (300 -> 500 -> 300
# units) followed by a single output unit with no activation.
model = Sequential([
    Dense(300, activation='relu'),
    Dense(500, activation='relu'),
    Dense(300, activation='relu'),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
In [14]:
# Train on the training split for 275 epochs without progress output.
model.fit(
    train_X,
    train_y,
    epochs=275,
    shuffle=True,
    verbose=0,
)

<tensorflow.python.keras.callbacks.History at 0x20f598087f0>

In [15]:
# Predict for the held-out validation rows. The model was fitted on train_y,
# so the outputs are in the same (scaled, when `scale` is True) log-price
# space and are mapped back in the next cell.
predictions = model.predict(val_X)

In [16]:
if scale:
    # Undo the target scaling (scaled = raw * multiplied_by + added) for both
    # the true and the predicted values. The arrays are modified in place, so
    # running this cell a second time would apply the inverse twice.
    for scaled_values in (val_y, predictions):
        scaled_values -= added
        scaled_values /= multiplied_by

In [17]:
# RMSE in log-price space (the target is log(SalePrice)).
rmse_log = np.sqrt(metrics.mean_squared_error(val_y, predictions))
# RMSE in the original SalePrice units, after undoing the log with exp.
rmse_price = np.sqrt(metrics.mean_squared_error(np.exp(val_y), np.exp(predictions)))
print(rmse_log)
print(rmse_price)

0.13132213743255053
25372.743232464025
