In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import math

In [2]:
# Labelled data set; it is split into training and validation parts further down.
df = pd.read_csv('./train.csv')

In [3]:
# Data set the final predictions are generated for; only its feature columns are read.
eval_df = pd.read_csv('./test.csv')

In [4]:
# Preview the first rows of the labelled frame.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Columns treated as numeric inputs: missing values are imputed and the values
# are min-max scaled by Normalizer before being fed to numeric feature columns.
numeric_column_names = [ 
    'LotFrontage', 'LotArea', 'PoolArea', 'MiscVal', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'YrSold'
]

# Columns treated as categorical inputs: each one gets a vocabulary built from
# the training rows and is fed to the model through an embedding column.
# Note that 'MSSubClass' and 'MoSold' hold integer codes but are listed here.
embedding_column_names = [ 
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
    'Fence', 'MiscFeature', 'MoSold', 'SaleType', 'SaleCondition', 'Foundation', 
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1'
]

# Every model input, numeric columns first.
feature_column_names = numeric_column_names + embedding_column_names
# Kept as a one-element list so that indexing a frame with it yields a DataFrame.
label_column_name = ['SalePrice']

In [6]:
# A fixed seed keeps the train/validation partition identical across kernel
# restarts. The embedding vocabularies (and therefore the embedding variable
# shapes stored in the ./model checkpoints) are derived from the training rows,
# so a different partition on every run can make existing checkpoints unusable.
SPLIT_SEED = 42

train_features, validation_features, train_labels, validation_labels = train_test_split(
    df[feature_column_names], df[label_column_name], test_size=0.33, random_state=SPLIT_SEED)

In [7]:
# Take an independent copy of the feature columns so that later preprocessing
# does not operate on a view of eval_df.
eval_features = eval_df[feature_column_names].copy()

In [8]:
class NaRemover():
    """Fills missing values using statistics learned from the training frame.

    Missing values are filled in three steps:

    1. columns named in the explicit ``mapping`` get their mapped value;
    2. categorical columns get the most frequent value seen during fitting;
    3. remaining gaps in numeric columns get the column mean seen during fitting.

    Args:
        numeric_columns: names of the numeric columns. Defaults to the
            notebook-level ``numeric_column_names``.
        categorical_columns: names of the categorical columns. Defaults to the
            notebook-level ``embedding_column_names``.
    """

    def __init__(self, numeric_columns=None, categorical_columns=None):
        self.numeric_columns = numeric_columns
        self.categorical_columns = categorical_columns

    def _numeric_columns(self):
        """Return the numeric column names, falling back to the notebook global."""
        if self.numeric_columns is None:
            return numeric_column_names
        return list(self.numeric_columns)

    def _categorical_columns(self):
        """Return the categorical column names, falling back to the notebook global."""
        if self.categorical_columns is None:
            return embedding_column_names
        return list(self.categorical_columns)

    def _apply_mapping(self, features):
        """Fill the explicitly mapped columns of ``features`` (modified in place)."""
        for column, value in self.mapping.items():
            features[column] = features[column].fillna(value)
        return features

    def _fill_from_statistics(self, features):
        """Fill categorical gaps with the fitted modes and numeric gaps with the fitted means."""
        for column, value in self.modes.items():
            features[column] = features[column].fillna(value)
        return features.fillna(self.mean)

    def fit_transform(self, features, mapping):
        """Learn the fill values from ``features`` and return a filled copy of it.

        Args:
            features: DataFrame holding the numeric and categorical columns.
            mapping: dict of column name -> value used to fill that column.

        Returns:
            A new DataFrame with missing values filled; ``features`` is left untouched.
        """
        # The means are taken before any filling, so they reflect observed values only.
        self.mean = features[self._numeric_columns()].mean()
        self.mapping = dict(mapping)

        # Work on a copy: the caller's frame may be a slice of a larger frame.
        mapped = self._apply_mapping(features.copy())

        # Learn the most frequent category once, after the explicit mapping has
        # been applied, so that every later frame is filled consistently.
        self.modes = {}
        for column in self._categorical_columns():
            counts = mapped[column].value_counts()
            # A column without a single observed value has no mode to learn.
            if len(counts) > 0:
                self.modes[column] = counts.index[0]

        return self._fill_from_statistics(mapped)

    def transform(self, features):
        """Return a copy of ``features`` filled with the values learned in ``fit_transform``.

        Args:
            features: DataFrame with the same columns as the fitted frame.

        Returns:
            A new DataFrame with missing values filled; ``features`` is left untouched.
        """
        return self._fill_from_statistics(self._apply_mapping(features.copy()))

In [9]:
class Normalizer():
    """Imputes missing values, then min-max scales the numeric columns.

    The imputer (``NaRemover``) and the ``MinMaxScaler`` are created and fitted
    in ``fit_transform``; ``transform`` reuses both for any further frame.
    """

    def fit_transform(self, features, mapping):
        """Fit on ``features`` and return the imputed, scaled frame.

        Args:
            features: DataFrame containing the notebook's feature columns.
            mapping: dict of column name -> fill value, forwarded to ``NaRemover``.

        Returns:
            The frame produced by the imputer with its numeric columns rescaled.
        """
        self.naRemover = NaRemover()
        self.scaler = MinMaxScaler()

        cleaned = self.naRemover.fit_transform(features, mapping)
        scaled_values = self.scaler.fit_transform(cleaned[numeric_column_names])
        cleaned[numeric_column_names] = scaled_values
        return cleaned

    def transform(self, features):
        """Impute and scale ``features`` with the already fitted imputer and scaler.

        Must be called after ``fit_transform``; both helpers are created there.
        """
        cleaned = self.naRemover.transform(features)
        scaled_values = self.scaler.transform(cleaned[numeric_column_names])
        cleaned[numeric_column_names] = scaled_values
        return cleaned

In [10]:
# Single preprocessing object: fitted on the training split, then reused for
# the validation and evaluation frames.
normalizer = Normalizer()

In [11]:
# Explicit fill values for missing entries: the categorical columns in the first
# group receive the literal string 'None', the numeric columns in the second
# group receive 0 (presumably a missing entry means the house lacks that
# feature -- confirm against the data description).
normalized_train_features = normalizer.fit_transform(
    train_features,
    {
        **dict.fromkeys(
            [
                'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
                'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
            ],
            'None'),
        **dict.fromkeys(
            ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea', 'PoolArea'],
            0),
    })

In [12]:
# Apply the already fitted normalizer; the scaler statistics come from the training split.
normalized_validation_features = normalizer.transform(validation_features)

In [13]:
# Same preprocessing for the frame the final predictions are made on.
normalized_eval_features = normalizer.transform(eval_features)

In [14]:
# Spot-check the preprocessed training frame.
normalized_train_features.head()

Unnamed: 0,LotFrontage,LotArea,PoolArea,MiscVal,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Fence,MiscFeature,MoSold,SaleType,SaleCondition,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1
794,0.168797,0.05835,0.0,0.032258,0.625,0.428571,0.884058,0.766667,0.0,0.0,...,,Shed,10,WD,Normal,PConc,Gd,TA,No,Unf
127,0.116438,0.018903,0.0,0.0,0.375,0.714286,0.42029,0.0,0.0,0.020553,...,,,6,WD,Normal,BrkTil,TA,TA,No,LwQ
919,0.226027,0.059556,0.0,0.0,0.5,0.857143,0.623188,0.866667,0.0,0.093551,...,,,5,WD,Normal,CBlock,Gd,TA,No,ALQ
52,0.304795,0.043903,0.0,0.0,0.375,0.428571,0.65942,0.216667,0.0,0.018427,...,,,5,WD,Normal,CBlock,Gd,TA,Gd,LwQ
348,0.05137,0.007027,0.0,0.0,0.625,0.428571,0.949275,0.9,0.06625,0.101524,...,,,10,WD,Normal,PConc,Gd,TA,No,GLQ


In [15]:
# One TensorFlow numeric feature column per numeric input column.
numeric_columns = [tf.feature_column.numeric_column(column) for column in numeric_column_names]

In [16]:
def _embedding_column_for(column):
    """Build the embedding feature column for one categorical input column.

    The vocabulary is the set of distinct values found in the preprocessed
    training frame; the embedding width is the fourth root of the vocabulary
    size, rounded up.
    """
    vocabulary = normalized_train_features[column].unique()
    width = math.ceil(len(vocabulary) ** 0.25)
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        column, vocabulary)
    return tf.feature_column.embedding_column(categorical, width)


embedding_columns = [_embedding_column_for(column) for column in embedding_column_names]

In [17]:
# Full list of feature columns handed to the model through the estimator params.
feature_columns = numeric_columns + embedding_columns

In [18]:
# Batch size shared by the training, evaluation and prediction input functions.
batch_size = 256

In [19]:
def train_input_fn(features, labels):
    """Build the training input pipeline.

    Args:
        features: DataFrame of preprocessed feature columns.
        labels: DataFrame of target values; cast to float32.

    Returns:
        A dataset of (feature dict, labels) batches of ``batch_size`` examples,
        shuffled with a 1000-element buffer and repeated (no repeat count is given).
    """
    feature_tensors = dict(features)
    label_tensor = tf.cast(labels, tf.float32)

    examples = tf.data.Dataset.from_tensor_slices((feature_tensors, label_tensor))
    examples = examples.apply(tf.contrib.data.shuffle_and_repeat(1000))

    batches = examples.batch(batch_size)
    return batches.prefetch(2)

In [20]:
def eval_input_fn(features, labels):
    """Build the evaluation input pipeline: one unshuffled pass over the data.

    Args:
        features: DataFrame of preprocessed feature columns.
        labels: DataFrame of target values; cast to float32.

    Returns:
        A dataset of (feature dict, labels) batches of ``batch_size`` examples.
    """
    tensors = (dict(features), tf.cast(labels, tf.float32))
    return tf.data.Dataset.from_tensor_slices(tensors).batch(batch_size).prefetch(2)

In [21]:
def predict_input_fn(features):
    """Build the prediction input pipeline: features only, in their original order.

    Args:
        features: DataFrame of preprocessed feature columns.

    Returns:
        A dataset of feature-dict batches of ``batch_size`` examples.
    """
    feature_tensors = dict(features)
    batched = tf.data.Dataset.from_tensor_slices(feature_tensors).batch(batch_size)
    return batched.prefetch(2)

In [28]:
def model_fn(features, labels, mode, params):
    """Estimator model function: a three-hidden-layer network trained on RMSLE.

    Each hidden block is a 100-unit ReLU dense layer followed by batch
    normalization; a final single-unit dense layer produces the prediction.
    The loss is the square root of the mean squared error between the
    logarithms of labels and predictions.

    Args:
        features: dict of feature tensors keyed by column name.
        labels: tensor of target values; not used in PREDICT mode.
        mode: one of the ``tf.estimator.ModeKeys`` values.
        params: dict whose ``'feature_columns'`` entry lists the feature
            columns used to build the input layer.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.
    """
    # Batch normalization must know whether it is training: without the flag the
    # Keras layer runs in inference mode in every Estimator mode.
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    net = tf.feature_column.input_layer(features, params['feature_columns'])

    batch_norm_layers = []
    for _ in range(3):
        net = tf.keras.layers.Dense(100, activation='relu')(net)
        batch_norm = tf.keras.layers.BatchNormalization()
        net = batch_norm(net, training=is_training)
        batch_norm_layers.append(batch_norm)

    predictions = tf.keras.layers.Dense(1)(net)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions)

    # NOTE(review): the logarithm of a non-positive prediction is NaN/-inf; the
    # output layer is unconstrained, so this relies on predictions staying positive.
    log_labels = tf.log(labels)
    log_predictions = tf.log(predictions)

    loss = tf.sqrt(tf.losses.mean_squared_error(log_labels, log_predictions))

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # The moving mean/variance are refreshed by separate update ops that the
        # optimizer does not run on its own. Depending on the TensorFlow version
        # they are exposed through the UPDATE_OPS collection and/or the layers'
        # `updates` attribute, so both are gathered (without duplicates).
        update_ops = list(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        seen = {id(op) for op in update_ops}
        for layer in batch_norm_layers:
            for op in layer.updates:
                if id(op) not in seen:
                    seen.add(id(op))
                    update_ops.append(op)

        optimizer = tf.train.AdamOptimizer()
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [29]:
# Despite its name this estimator performs regression: model_fn emits a single
# value per example. Checkpoints are written to, and restored from, ./model.
classifier = tf.estimator.Estimator(
    model_fn, 
    model_dir='./model',
    params={ 'feature_columns': feature_columns })

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbcc78292e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [34]:
def _training_batches():
    """Input function feeding the preprocessed training split to the estimator."""
    return train_input_fn(normalized_train_features, train_labels)


def _validation_batches():
    """Input function feeding the preprocessed validation split to the estimator."""
    return eval_input_fn(normalized_validation_features, validation_labels)


# Ten rounds of 1000 optimisation steps, each followed by one evaluation over
# the validation split. `epoch` counts rounds, not passes over the data.
for epoch in range(10):
    classifier.train(input_fn=_training_batches, steps=1000)
    classifier.evaluate(input_fn=_validation_batches)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2001 into ./model/model.ckpt.
INFO:tensorflow:loss = 0.13690935, step = 2000
INFO:tensorflow:global_step/sec: 10.3786
INFO:tensorflow:loss = 0.14955348, step = 2100 (9.640 sec)
INFO:tensorflow:global_step/sec: 18.463
INFO:tensorflow:loss = 0.13054636, step = 2200 (5.416 sec)
INFO:tensorflow:global_step/sec: 18.7027
INFO:tensorflow:loss = 0.13397594, step = 2300 (5.347 sec)
INFO:tensorflow:global_step/sec: 18.7031
INFO:tensorflow:loss = 0.12236265, step = 2400 (5.345 sec)
INFO:tensorflow:global_step/sec: 18.6423
INFO:tensorflow:loss = 0.14511068, step = 2500 (5.363 sec)
INFO:tensorflow:global_step/sec: 18.6367
INFO:tensorflow:loss = 0.12743637, step =

In [None]:
def _prediction_batches():
    """Input function feeding the preprocessed evaluation frame to the estimator."""
    return predict_input_fn(normalized_eval_features)


# The model's output layer has a single unit, so each yielded prediction is
# indexed with [0] to extract its one value.
predictions = [row[0] for row in classifier.predict(input_fn=_prediction_batches)]

INFO:tensorflow:Calling model_fn.


In [None]:
# First identifier used when the evaluation frame carries no 'Id' column.
offset = 1461

# Prefer the identifiers shipped with the evaluation data so every prediction
# stays paired with its own row even if the ids are not contiguous; fall back
# to consecutive numbering from `offset` otherwise.
if 'Id' in eval_df.columns:
    submission_ids = eval_df['Id'].values
else:
    submission_ids = range(offset, len(predictions) + offset)

submissions = pd.DataFrame({
    'Id': submission_ids,
    'SalePrice': predictions
})

In [None]:
# index=False keeps the DataFrame index out of the file, leaving only Id and SalePrice.
submissions.to_csv('submissions.csv', index=False)