In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import math

In [2]:
df = pd.read_csv('./train.csv')

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
numeric_column_names = ['MSSubClass', 'LotFrontage', 'LotArea', 'PoolArea', 'MiscVal']
embedding_column_names = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'SaleCondition']
feature_column_names = numeric_column_names + embedding_column_names
label_column_name = ['SalePrice']

In [5]:
train_features, validation_features, train_labels, validation_labels = train_test_split(
    df[feature_column_names], df[label_column_name], test_size=0.33)

In [6]:
class NaRemover():
    def fit_transform(self, features, mapping):
        self.mean = features[numeric_column_names].mean()
        self.mapping = mapping
        return self.transform(features)
    
    def transform(self, features):
        for column, value in self.mapping.items():
            features[column] = features[column].fillna(value)
        
        return features.fillna(self.mean)

In [7]:
class Normalizer():
    def fit_transform(self, features, mapping):
        self.naRemover = NaRemover()
        self.scaler = MinMaxScaler()
        
        features = self.naRemover.fit_transform(features, mapping)
        features[numeric_column_names] = self.scaler.fit_transform(features[numeric_column_names])
        return features
    
    def transform(self, features):
        features = self.naRemover.transform(features)
        features[numeric_column_names] = self.scaler.transform(features[numeric_column_names])
        return features

In [8]:
normalizer = Normalizer()

In [65]:
normalized_train_features = normalizer.fit_transform(train_features, { 
    'Alley': 'None',
    'MSZoning': 'RL',
    'Utilities': 'AllPub'
})

In [66]:
normalized_validation_features = normalizer.transform(validation_features)

In [11]:
normalized_train_features.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,PoolArea,MiscVal,MSZoning,Street,Alley,LotShape,LandContour,Utilities,SaleCondition
170,0.176471,0.170317,0.050901,0.0,0.0,RM,Pave,,IR1,Lvl,AllPub,Normal
655,0.823529,0.0,0.00095,0.0,0.0,RM,Pave,,Reg,Lvl,AllPub,Family
900,0.0,0.170317,0.027427,0.0,0.0,RL,Pave,,IR1,Lvl,AllPub,Normal
1073,0.235294,0.184932,0.03028,0.0,0.0,RL,Pave,,IR1,Bnk,AllPub,Normal
1348,0.0,0.170317,0.068855,0.0,0.0,RL,Pave,,IR3,Low,AllPub,Normal


In [12]:
numeric_columns = [tf.feature_column.numeric_column(column) for column in numeric_column_names]

In [13]:
embedding_columns = []

for column in embedding_column_names:
    vocabulary_list = normalized_train_features[column].unique()
    dimension = math.ceil(len(vocabulary_list) ** 0.25)
    
    categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(
        column, vocabulary_list)
    
    embedding_columns.append(tf.feature_column.embedding_column(
        categorical_column, dimension))

In [14]:
feature_columns = numeric_columns + embedding_columns

In [15]:
batch_size = 128

In [16]:
def train_input_fn(features, labels):
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), tf.cast(labels, tf.float32)))
    dataset = tf.data.Dataset.apply(dataset, tf.contrib.data.shuffle_and_repeat(1000))
    return dataset.batch(batch_size).prefetch(2)

In [17]:
def eval_input_fn(features, labels):
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), tf.cast(labels, tf.float32)))
    return dataset.batch(batch_size).prefetch(2)

In [28]:
def predict_input_fn(features):
    dataset = tf.data.Dataset.from_tensor_slices(dict(features))
    return dataset.batch(batch_size).prefetch(2)

In [80]:
def model_fn(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])
    net = tf.keras.layers.Dense(100, activation='relu')(net)
    net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.Dense(100, activation='relu')(net)
    net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.Dense(100, activation='relu')(net)
    net = tf.keras.layers.BatchNormalization()(net)
    predictions = tf.keras.layers.Dense(1)(net)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions)
    
    log_labels = tf.log(labels)
    log_predictions = tf.log(predictions)
    
    loss = tf.sqrt(tf.losses.mean_squared_error(log_labels, log_predictions))
    
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [81]:
classifier = tf.estimator.Estimator(
    model_fn, 
    model_dir='./model',
    params={ 'feature_columns': feature_columns })

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7096f76898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [82]:
for epoch in range(10):
    classifier.train(
        input_fn=lambda: train_input_fn(normalized_train_features, train_labels),
        steps=1000)
    
    classifier.evaluate(
        input_fn=lambda: eval_input_fn(normalized_validation_features, validation_labels))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-13100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 13101 into ./model/model.ckpt.
INFO:tensorflow:loss = 0.3213071, step = 13100
INFO:tensorflow:global_step/sec: 44.2851
INFO:tensorflow:loss = 0.3079804, step = 13200 (2.259 sec)
INFO:tensorflow:global_step/sec: 75.4034
INFO:tensorflow:loss = 0.33040375, step = 13300 (1.328 sec)
INFO:tensorflow:global_step/sec: 77.0751
INFO:tensorflow:loss = 0.25584647, step = 13400 (1.297 sec)


KeyboardInterrupt: 

In [83]:
eval_df = pd.read_csv('./test.csv')

In [84]:
normalized_eval_features = normalizer.transform(eval_df[feature_column_names])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [99]:
predictions = [prediction[0] for prediction in classifier.predict(input_fn=lambda: predict_input_fn(normalized_eval_features))]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-13101
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [107]:
offset = 1461
submissions = pd.DataFrame({
    'Id': range(offset, len(predictions) + offset),
    'SalePrice': predictions
})

In [110]:
submissions.head()

Unnamed: 0,Id,SalePrice
0,1461,137941.40625
1,1462,74943.125
2,1463,190459.46875
3,1464,189439.0
4,1465,210601.0


In [111]:
submissions.to_csv('submissions.csv', index=False)