In [41]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import math

In [2]:
# Load the raw training table from the CSV that sits next to the notebook.
df = pd.read_csv('./train.csv')

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Columns fed to the model as continuous inputs (imputed and scaled later on).
numeric_column_names = [
    'MSSubClass',
    'LotFrontage',
    'LotArea',
    'PoolArea',
    'MiscVal',
]

# String-valued columns that are turned into learned embeddings.
embedding_column_names = [
    'MSZoning',
    'Street',
    'Alley',
    'LotShape',
    'LandContour',
    'Utilities',
    'SaleCondition',
]

# Model inputs: numeric columns first, then the categorical ones.
feature_column_names = [*numeric_column_names, *embedding_column_names]

# Kept as a one-element list so that df[label_column_name] yields a DataFrame.
label_column_name = ['SalePrice']

In [5]:
# Hold out 33% of the rows for validation. The fixed seed keeps the split (and
# every statistic fitted on it further down) identical across
# Restart Kernel -> Run All.
train_features, validation_features, train_labels, validation_labels = train_test_split(
    df[feature_column_names], df[label_column_name], test_size=0.33, random_state=42)

In [12]:
class NaRemover():
    """Fill missing values: per-column constants first, then fitted column means.

    ``fit_transform`` stores the mean of every column listed in the
    module-level ``numeric_column_names`` together with a
    ``{column: value}`` mapping; ``transform`` re-applies both to any frame.
    """

    def fit_transform(self, features, mapping):
        """Learn the fill values from ``features`` and return a filled copy.

        Args:
            features: DataFrame containing every column named in
                ``numeric_column_names`` and every key of ``mapping``.
            mapping: dict of column name -> replacement used for missing
                entries of that column.

        Returns:
            A new DataFrame with missing values filled.
        """
        self.mean = features[numeric_column_names].mean()
        self.mapping = mapping
        return self.transform(features)

    def transform(self, features):
        """Return a filled copy of ``features`` using the stored fill values.

        The input frame is never modified.

        Raises:
            KeyError: if a column named in the stored mapping is absent.
        """
        # Work on a copy: assigning into ``features`` directly would write into
        # the caller's frame, so re-running a cell would see different input.
        filled = features.copy()
        for column, value in self.mapping.items():
            filled[column] = filled[column].fillna(value)

        # Whatever is still missing gets the mean recorded during fitting.
        return filled.fillna(self.mean)

In [25]:
class Normalizer():
    """Impute missing values, then min-max scale the numeric columns.

    The columns that get scaled are the ones listed in the module-level
    ``numeric_column_names``; all other columns pass through after imputation.
    """

    def fit_transform(self, features, mapping):
        """Fit imputer and scaler on ``features`` and return the processed frame.

        ``mapping`` is forwarded unchanged to ``NaRemover.fit_transform``.
        """
        self.scaler = MinMaxScaler()
        self.naRemover = NaRemover()

        imputed = self.naRemover.fit_transform(features, mapping)
        scaled_values = self.scaler.fit_transform(imputed[numeric_column_names])
        imputed[numeric_column_names] = scaled_values
        return imputed

    def transform(self, features):
        """Apply the already fitted imputer and scaler to ``features``."""
        imputed = self.naRemover.transform(features)
        scaled_values = self.scaler.transform(imputed[numeric_column_names])
        imputed[numeric_column_names] = scaled_values
        return imputed

In [26]:
# One preprocessing object shared by the training and validation splits.
normalizer = Normalizer()

In [27]:
# Fit imputation and scaling on the training split only. Missing 'Alley'
# entries are replaced with the literal category 'None'.
normalized_train_features = normalizer.fit_transform(train_features, { 'Alley': 'None' })

In [28]:
# Re-use the fill values and scaler fitted on the training split; nothing is
# refit on the validation rows.
normalized_validation_features = normalizer.transform(validation_features)

In [31]:
# Check the preprocessed training rows: numeric columns scaled, categoricals kept as strings.
normalized_train_features.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,PoolArea,MiscVal,MSZoning,Street,Alley,LotShape,LandContour,Utilities,SaleCondition
459,0.176471,0.168043,0.034984,0.0,0.0,RL,Pave,,IR1,Bnk,AllPub,Normal
814,0.176471,0.082192,0.042532,0.0,0.0,RL,Pave,Grvl,Reg,Lvl,AllPub,Normal
1118,0.352941,0.219178,0.076671,0.0,0.0,RL,Pave,,Reg,Lvl,AllPub,Normal
1147,0.294118,0.184932,0.0655,0.0,0.0,RL,Pave,,Reg,Bnk,AllPub,Normal
979,0.0,0.202055,0.046009,0.0,0.0,RL,Pave,,Reg,Lvl,AllPub,Normal


In [46]:
# One numeric feature column per continuous input.
numeric_columns = [tf.feature_column.numeric_column(column) for column in numeric_column_names]

In [47]:
embedding_columns = []

for column in embedding_column_names:
    # ``unique()`` returns values in order of appearance, which depends on how
    # the rows were shuffled. Sorting pins each category to the same id on
    # every run, so embeddings restored from a checkpoint still line up with
    # their categories. ``dropna()`` keeps a stray NaN out of the vocabulary.
    vocabulary_list = sorted(normalized_train_features[column].dropna().unique())
    # Embedding width: fourth root of the vocabulary size, rounded up.
    dimension = math.ceil(len(vocabulary_list) ** 0.25)

    categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(
        column, vocabulary_list)

    embedding_columns.append(tf.feature_column.embedding_column(
        categorical_column, dimension))

In [48]:
# Full set of feature columns; handed to the model through params['feature_columns'].
feature_columns = numeric_columns + embedding_columns

In [49]:
# Number of examples per batch; read by train_input_fn.
batch_size = 32

In [86]:
def train_input_fn(features, labels, shuffle_buffer_size=1000):
    """Build the training dataset: shuffled, repeated indefinitely, batched.

    Args:
        features: DataFrame (or other column -> values mapping) of model
            inputs; converted with ``dict(features)`` so each column becomes
            one named tensor.
        labels: target values aligned row-for-row with ``features``.
        shuffle_buffer_size: size of the shuffle buffer. Defaults to 1000,
            the value that was previously hard-coded.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches of
        the module-level ``batch_size``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # ``tf.contrib.data.shuffle_and_repeat`` is deprecated in favour of
    # ``shuffle`` followed by ``repeat``, and ``tf.contrib`` does not exist in
    # TensorFlow 2.x. The core methods work in both.
    dataset = dataset.shuffle(shuffle_buffer_size).repeat()
    dataset = dataset.batch(batch_size)
    return dataset

In [106]:
def model_fn(features, labels, mode, params):
    """Estimator model function: three Dense(100)+BatchNorm blocks, one linear output.

    Args:
        features: dict of input tensors keyed by feature name.
        labels: target tensor; ``None`` in PREDICT mode.
        mode: one of ``tf.estimator.ModeKeys``.
        params: dict holding the feature columns under ``'feature_columns'``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` appropriate for ``mode``:
        predictions only for PREDICT, the RMSE loss for EVAL, and loss plus
        train op for TRAIN.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    net = tf.feature_column.input_layer(features, params['feature_columns'])

    batch_norm_updates = []
    for _ in range(3):
        net = tf.keras.layers.Dense(100, activation='relu')(net)
        batch_norm = tf.keras.layers.BatchNormalization()
        # Pass the phase explicitly so batch statistics are used while
        # training and the moving averages everywhere else.
        net = batch_norm(net, training=is_training)
        # Keras layers keep their moving-average updates on the layer itself;
        # collect them so they can be run together with the train op.
        batch_norm_updates.extend(batch_norm.updates)
    predictions = tf.keras.layers.Dense(1)(net)

    # No labels exist in PREDICT mode, so return before building the loss.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode, predictions={'predictions': predictions})

    # Root-mean-squared error between labels and predictions.
    loss = tf.sqrt(tf.losses.mean_squared_error(labels, predictions))

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss)

    optimizer = tf.train.AdamOptimizer()
    # Also pick up anything registered in the graph-level UPDATE_OPS collection.
    update_ops = batch_norm_updates + tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [107]:
# Custom Estimator around model_fn, using ./model as its model directory.
# NOTE(review): despite the name, this is a regressor -- model_fn minimises an
# RMSE loss on SalePrice. The name is kept because the training cell uses it.
classifier = tf.estimator.Estimator(
    model_fn, 
    model_dir='./model',
    params={ 'feature_columns': feature_columns })

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa55752b4a8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# Train for 1000 steps. The lambda binds the preprocessed training data so the
# Estimator can invoke the input function without arguments.
classifier.train(
    input_fn=lambda: train_input_fn(normalized_train_features, train_labels),
    steps=1000)

INFO:tensorflow:Calling model_fn.
