In [182]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import sklearn

In [183]:
sns.set_theme()

## Load and preprocess data

In [184]:
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [185]:
print(train_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [186]:
train_data.shape

(1460, 81)

In [187]:
train_data = pd.read_csv('train.csv')

In [188]:
def preprocess(df : pd.DataFrame) -> pd.DataFrame:
    '''
    Process raw dataframe into one ready for a tensorflow tensor
    '''
    df.drop('Id', inplace=True, axis=1)
    static_cols = df.columns
    for c in static_cols:
        if pd.api.types.is_numeric_dtype(df[c]):
            fill_value = df[c].mean()
            df[c] = df[c].fillna(fill_value)
        elif pd.api.types.is_string_dtype(df[c]):
            one_hot = pd.get_dummies(df[c], prefix=c)
            df.drop(c, inplace=True, axis=1)
            df = df.join(one_hot)

        else:
            raise NotImplementedError('Failed to find contained value of this column')
    return df

In [189]:
train_data = pd.read_csv('train.csv')
train_data = preprocess(train_data)

In [190]:
train_data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [191]:
bad_data = train_data[train_data.isna().any(axis=1)]

In [192]:
assert(bad_data.shape[0] == 0) # catch bad preprocess

In [193]:
from sklearn import preprocessing

## Convert to Tensorflow Dataset (and split validation set)

In [194]:

y_df_train = train_data['SalePrice']
x_df_train = train_data.drop('SalePrice', inplace=False, axis=1)
scaler = preprocessing.StandardScaler().fit(x_df_train)
x_df_train = scaler.transform(x_df_train)

In [195]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(x_df_train, y_df_train)

In [196]:
X_train , X_val, y_train, y_val = tf.convert_to_tensor(X_train), tf.convert_to_tensor(X_val), tf.convert_to_tensor(y_train), tf.convert_to_tensor(y_val)



In [197]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(10)

In [198]:
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(10)

## Create a model

In [218]:
def create_non_seq_model():
    input = tf.keras.Input(shape=[288,])
    hidden1 = tf.keras.layers.Dense(300, activation='relu')(input)
    hidden2 = tf.keras.layers.Dense(300, activation='relu')(hidden1)
    hidden3 = tf.keras.layers.Dense(300, activation='relu')(hidden2)
    hidden4 = tf.keras.layers.Dense(300, activation='relu')(hidden3)
    hidden5 = tf.keras.layers.Dense(150, activation='relu')(hidden4)
    hidden6 = tf.keras.layers.Dense(75, activation='relu')(hidden5)
    concat = tf.keras.layers.Concatenate()([input, hidden6])
    output = tf.keras.layers.Dense(1)(concat)
    model = tf.keras.Model(inputs=[input], outputs=[output])
    return model


In [219]:
from tensorflow.keras.losses import MeanSquaredError
loss_fn = MeanSquaredError()
model = create_non_seq_model()
model.compile(optimizer='adam', loss=loss_fn)

In [220]:
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 288)]        0           []                               
                                                                                                  
 dense_33 (Dense)               (None, 300)          86700       ['input_16[0][0]']               
                                                                                                  
 dense_34 (Dense)               (None, 300)          90300       ['dense_33[0][0]']               
                                                                                                  
 dense_35 (Dense)               (None, 300)          90300       ['dense_34[0][0]']               
                                                                                            

In [223]:
import datetime

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"./logs_{str(datetime.datetime.now()).replace(':','_')}")
model.fit(train_ds, validation_data=val_ds, epochs=200, callbacks=[tensorboard_callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x20a508f8040>

In [177]:
test_set = pd.read_csv('test.csv')
test_set = preprocess(test_set)

In [178]:
test_set.shape

(1459, 270)