In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

tf.logging.set_verbosity(tf.logging.INFO)
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
def load_datasets():
    # reading the datasets
    train = pd.read_csv('../input/train.csv.gz', compression='infer')
    infer = pd.read_csv('../input/test.csv.gz', compression='infer')
    
    train_features = train.drop(['Id','SalePrice'], axis=1)
    train_labels = train['SalePrice']
    infer_features = infer.drop('Id', axis=1)
    
    from sklearn.model_selection import train_test_split
    X_train, X_eval, y_train, y_eval = train_test_split(train_features, train_labels, test_size=0.33, random_state=42)
    
    def get_bins(df):
        bins = {}
        numeric_columns = [
            'LotFrontage', 'LotArea', 'TotalBsmtSF', 'BsmtFinSF2', 'BsmtUnfSF', 'WoodDeckSF',
            'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
        
        for column in numeric_columns:
            tempdf, retbins = pd.cut(df[column], 10, labels=None, retbins=True, include_lowest=True)
            bins[column] = retbins

        return bins 
    
    bins = get_bins(X_train)
    
    labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    
    return X_train, X_eval, y_train, y_eval, infer_features, bins

In [3]:
X_train, X_eval, y_train, y_eval, infer_features, bins = load_datasets()

In [4]:
from tensorflow.feature_column import *

all_columns = X_train.columns
numeric_columns = [
        'LotFrontage', 'LotArea', 'TotalBsmtSF', 'BsmtFinSF2', 'BsmtUnfSF', 'WoodDeckSF',
        'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
        'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
categorical_columns = set(all_columns) - set(numeric_columns)

BUCKETIZED_FEATURES = [bucketized_column(numeric_column(column), boundaries=bins[column].tolist()) for column in 
                      numeric_columns]

CATEGORICAL_FEATURES = [categorical_column_with_identity(key=column, num_buckets=150) for column in categorical_columns]

FEATURE_COLUMNS = []
for feature in BUCKETIZED_FEATURES:
    FEATURE_COLUMNS.append(feature)
    
for feature in CATEGORICAL_FEATURES:
    FEATURE_COLUMNS.append(indicator_column(feature))
    
for column in categorical_columns:
    X_train[column], junk = pd.factorize(X_train[column])
    X_eval[column], junk = pd.factorize(X_eval[column])
    infer_features[column], junk = pd.factorize(infer_features[column])

In [5]:
def train_input_fn(features, labels):
    return tf.estimator.inputs.pandas_input_fn(
                x = features,
                y = labels,
                num_epochs = 1000,
                shuffle = True,
                batch_size=50)

def eval_input_fn(features, labels):
    return tf.estimator.inputs.pandas_input_fn(
                x = features,
                y = labels,
                num_epochs = 1,
                shuffle = True)

def pred_input_fn(features):
    return tf.estimator.inputs.pandas_input_fn(
                x = features,
                num_epochs = 1,
                shuffle = False)

In [6]:
def Optimizer(learning_rate=0.1):
    return tf.train.AdamOptimizer(learning_rate=learning_rate)

Adam = Optimizer()

In [7]:
OUTDIR = './DNN_REGRESSOR'

file_writer = tf.summary.FileWriter(OUTDIR)

estimator = tf.estimator.DNNRegressor(
    feature_columns=FEATURE_COLUMNS, 
    hidden_units=[128,128,128],
    model_dir=OUTDIR, 
    dropout=0.25,
    optimizer=Adam,
    batch_norm=True)

train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn(X_train, y_train))
eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn(X_eval, y_eval))

tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_REGRESSOR', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001603427E320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling

INFO:tensorflow:global_step/sec: 44.0782
INFO:tensorflow:loss = 8773583000.0, step = 6801 (2.269 sec)
INFO:tensorflow:global_step/sec: 40.3037
INFO:tensorflow:loss = 32734677000.0, step = 6901 (2.483 sec)
INFO:tensorflow:global_step/sec: 39.8073
INFO:tensorflow:loss = 22676720000.0, step = 7001 (2.512 sec)
INFO:tensorflow:global_step/sec: 40.9145
INFO:tensorflow:loss = 19131556000.0, step = 7101 (2.443 sec)
INFO:tensorflow:global_step/sec: 39.8639
INFO:tensorflow:loss = 14222802000.0, step = 7201 (2.510 sec)
INFO:tensorflow:global_step/sec: 41.1975
INFO:tensorflow:loss = 9576225000.0, step = 7301 (2.441 sec)
INFO:tensorflow:global_step/sec: 41.4985
INFO:tensorflow:loss = 28157014000.0, step = 7401 (2.394 sec)
INFO:tensorflow:global_step/sec: 41.4372
INFO:tensorflow:loss = 10378382000.0, step = 7501 (2.413 sec)
INFO:tensorflow:global_step/sec: 41.461
INFO:tensorflow:loss = 8821180000.0, step = 7601 (2.412 sec)
INFO:tensorflow:global_step/sec: 41.7669
INFO:tensorflow:loss = 52584284000.0

INFO:tensorflow:global_step/sec: 38.3866
INFO:tensorflow:loss = 9723156000.0, step = 14801 (2.591 sec)
INFO:tensorflow:global_step/sec: 38.9127
INFO:tensorflow:loss = 28014404000.0, step = 14901 (2.568 sec)
INFO:tensorflow:global_step/sec: 41.0444
INFO:tensorflow:loss = 7891641000.0, step = 15001 (2.436 sec)
INFO:tensorflow:global_step/sec: 41.3696
INFO:tensorflow:loss = 16749250000.0, step = 15101 (2.417 sec)
INFO:tensorflow:global_step/sec: 40.2657
INFO:tensorflow:loss = 10063517000.0, step = 15201 (2.484 sec)
INFO:tensorflow:global_step/sec: 41.337
INFO:tensorflow:loss = 20948800000.0, step = 15301 (2.419 sec)
INFO:tensorflow:global_step/sec: 40.7377
INFO:tensorflow:loss = 10236188000.0, step = 15401 (2.455 sec)
INFO:tensorflow:global_step/sec: 41.6338
INFO:tensorflow:loss = 7372962000.0, step = 15501 (2.402 sec)
INFO:tensorflow:global_step/sec: 40.6434
INFO:tensorflow:loss = 19881918000.0, step = 15601 (2.460 sec)
INFO:tensorflow:global_step/sec: 40.0332
INFO:tensorflow:loss = 1974

({'average_loss': 3100744400.0,
  'label/mean': 180666.03,
  'loss': 373639700000.0,
  'prediction/mean': 174940.31,
  'global_step': 19560},
 [])

In [8]:
generator = estimator.predict(input_fn=pred_input_fn(X_eval))
predictions = [next(generator) for i in range(len(X_eval))]
values = [val['predictions'].tolist()[0] for val in predictions]

from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error(y_eval, values))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_REGRESSOR\model.ckpt-19560
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


0.29222283727109016

In [None]:
generator = estimator.predict(input_fn=pred_input_fn(infer_features))
predictions = [next(generator) for i in range(len(infer_features))]
values = [val['predictions'].tolist()[0] for val in predictions]

In [None]:
ids = pd.read_csv('../input/test.csv.gz', compression='infer')['Id']

submission = pd.DataFrame()
submission['SalePrice'] = values
submission['Id'] = ids
submission.set_index('Id', inplace=True)

submission.to_csv('../output/05.dnn_regressor[128,64]_trained_r3.csv')