References:
1. https://medium.com/coinmonks/linear-regression-with-tensorflow-canned-estimators-6cc4ffddd14f
2. https://hackernoon.com/build-your-first-neural-network-to-predict-house-prices-with-keras-3fb0839680f4

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from time import time, strftime, localtime

In [2]:
import math
import h5py
import matplotlib.pyplot as plt
import scipy
#from PIL import Image
from scipy import ndimage
import tensorflow as tf
from tensorflow.python.framework import ops
#from cnn_utils import *

%matplotlib inline

In [3]:
### Load data
# train_df feeds the model fit, test_df supplies the rows to predict on, and
# submit_test_df provides the building_id column for the submission file.
train_df, test_df, submit_test_df = [
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv', 'dataset/submit_test.csv')
]

In [82]:
### Process feature columns

# 32 features (the columns named I, II, III... are not included)


def _float_feature(key):
    """Return a float64 numeric feature column with shape () read from `key`."""
    return tf.feature_column.numeric_column(key, dtype=tf.float64, shape=())


# [numeric]
building_material = _float_feature('building_material')
city = _float_feature('city')
txn_dt = _float_feature('txn_dt')
total_floor = _float_feature('total_floor')
building_type = _float_feature('building_type')
building_use = _float_feature('building_use')
building_complete_dt = _float_feature('building_complete_dt')
parking_way = _float_feature('parking_way')
# 8
parking_area = _float_feature('parking_area')
parking_price = _float_feature('parking_price')
txn_floor = _float_feature('txn_floor')
land_area = _float_feature('land_area')
building_area = _float_feature('building_area')
town = _float_feature('town')
lat = _float_feature('lat')
lon = _float_feature('lon')
# 16
village_income_median = _float_feature('village_income_median')
town_population = _float_feature('town_population')
town_area = _float_feature('town_area')
town_population_density = _float_feature('town_population_density')
doc_rate = _float_feature('doc_rate')
master_rate = _float_feature('master_rate')
bachelor_rate = _float_feature('bachelor_rate')
jobschool_rate = _float_feature('jobschool_rate')
# 24
highschool_rate = _float_feature('highschool_rate')
junior_rate = _float_feature('junior_rate')
elementary_rate = _float_feature('elementary_rate')
born_rate = _float_feature('born_rate')
death_rate = _float_feature('death_rate')
marriage_rate = _float_feature('marriage_rate')
divorce_rate = _float_feature('divorce_rate')
village = _float_feature('village')
# 32

# [categorical column with hash bucket]
# [categorical column with vocabulary list]
# [feature crosses]
# (placeholders: nothing is defined under these headings in this cell)


# Model inputs: every numeric column except "txn_dt" and "building_complete_dt",
# which are deliberately left out. Both lists below follow the same order.
feature_cols = [
    building_material, city, total_floor, building_type, building_use,
    parking_way, parking_area, parking_price, txn_floor, land_area,
    building_area, town, lat, lon, village_income_median,
    town_population, town_area, town_population_density, doc_rate, master_rate,
    bachelor_rate, jobschool_rate, highschool_rate, junior_rate, elementary_rate,
    born_rate, death_rate, marriage_rate, divorce_rate, village,
]

# DataFrame column names for the same 30 features
# ("txn_dt" and "building_complete_dt" removed here as well).
feature_names = [
    'building_material', 'city', 'total_floor', 'building_type', 'building_use',
    'parking_way', 'parking_area', 'parking_price', 'txn_floor', 'land_area',
    'building_area', 'town', 'lat', 'lon', 'village_income_median',
    'town_population', 'town_area', 'town_population_density', 'doc_rate', 'master_rate',
    'bachelor_rate', 'jobschool_rate', 'highschool_rate', 'junior_rate', 'elementary_rate',
    'born_rate', 'death_rate', 'marriage_rate', 'divorce_rate', 'village',
]

# Regression target column in train_df.
label_name = 'total_price'

# Select the model inputs and the target; missing values are replaced by 0.0.
features_df = train_df[feature_names].fillna(value=0.0)
label_df = train_df[label_name].fillna(value=0.0)

# data preprocessing
# NOTE(review): the scaler is fitted on every training row before the split
# below, so the validation rows also influence the learned min/max. Consider
# fitting on X_train only if a strictly clean validation score is needed.
min_max_scaler = preprocessing.MinMaxScaler()
# preserve column name after scaler
mms_features_df = pd.DataFrame(min_max_scaler.fit_transform(features_df), index=features_df.index, columns=features_df.columns)

X_train, X_test, y_train, y_test = train_test_split(mms_features_df, label_df, random_state=0, test_size=0.2)

# The prediction rows must go through the SAME fitted scaler as the training
# rows (transform, not fit_transform); otherwise the model is evaluated on
# raw-valued inputs although it was trained on min-max scaled ones.
# pred_ndarray stays a DataFrame so it can still be indexed by column name.
pred_features_df = test_df[feature_names].fillna(value=0.0)
pred_ndarray = pd.DataFrame(min_max_scaler.transform(pred_features_df), index=pred_features_df.index, columns=pred_features_df.columns)

In [63]:
### Input function for training
def train_input(batch_size=32):
    """Estimator input_fn that yields one pass over the training split.

    The dataset is neither shuffled nor repeated here, so the one-shot
    iterator is exhausted after a single pass over X_train / y_train.

    Args:
        batch_size: Number of rows per batch. Defaults to 32.

    Returns:
        A `(features, labels)` pair of tensors: `features` is a dict keyed by
        feature name, `labels` is the matching slice of y_train.
    """
    # Build the dict from feature_names instead of listing every key by hand,
    # so the inputs cannot drift from the feature list used for the model.
    feature_slices = {name: X_train[name] for name in feature_names}
    _dataset = tf.data.Dataset.from_tensor_slices((feature_slices, y_train))
    dataset = _dataset.batch(batch_size)
    # create an iterator
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

In [64]:
### Input function for validation
def val_input(batch_size=32):
    """Estimator input_fn that yields one pass over the held-out split.

    The dataset is neither shuffled nor repeated here, so the one-shot
    iterator is exhausted after a single pass over X_test / y_test.

    Args:
        batch_size: Number of rows per batch. Defaults to 32.

    Returns:
        A `(features, labels)` pair of tensors: `features` is a dict keyed by
        feature name, `labels` is the matching slice of y_test.
    """
    # Convert the inputs to a Dataset. The dict is built from feature_names
    # instead of a hand-written key list, so it cannot drift from the feature
    # list used for the model.
    feature_slices = {name: X_test[name] for name in feature_names}
    _dataset = tf.data.Dataset.from_tensor_slices((feature_slices, y_test))
    dataset = _dataset.batch(batch_size)
    # create an iterator
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

In [65]:
### Input function for prediction
def pred_input(batch_size=32):
    """Estimator input_fn that yields the prediction rows, without labels.

    Args:
        batch_size: Number of rows per batch. Defaults to 32.

    Returns:
        A batched `tf.data.Dataset` whose elements are dicts keyed by feature
        name, sliced from pred_ndarray.
    """
    # Build the dict from feature_names instead of listing every key by hand,
    # so the inputs cannot drift from the feature list used for the model.
    feature_slices = {name: pred_ndarray[name] for name in feature_names}
    _dataset = tf.data.Dataset.from_tensor_slices(feature_slices)
    # Batch the examples
    dataset = _dataset.batch(batch_size)

    return dataset

In [66]:
### Build Model

# Canned linear regressor over the selected feature columns; every other
# constructor argument is left at its library default.
estimator = tf.estimator.LinearRegressor(
    feature_columns=feature_cols,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/0q/_btb7f211tx6vcv4w1mntbyc0000gn/T/tmp8cs0mrv5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a521e2ef0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [97]:
### Train Model
# steps=None puts no step limit on this call; the estimator is passed
# train_input as its input function.
estimator.train(
    input_fn=train_input,
    steps=None,
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/0q/_btb7f211tx6vcv4w1mntbyc0000gn/T/tmp8cs0mrv5/model.ckpt-2400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2400 into /var/folders/0q/_btb7f211tx6vcv4w1mntbyc0000gn/T/tmp8cs0mrv5/model.ckpt.
INFO:tensorflow:loss = 4009799200000000.0, step = 2401
INFO:tensorflow:global_step/sec: 116.308
INFO:tensorflow:loss = 1.1450621e+16, step = 2501 (0.861 sec)
INFO:tensorflow:global_step/sec: 678.943
INFO:tensorflow:loss = 5081369000000000.0, step = 2601 (0.148 sec)
INFO:tensorflow:global_step/sec: 634.82
INFO:tensorflow:loss = 1.1440892e+16, step = 2701 (0.157 sec)
INFO:tensorflow:global_step/sec: 661.629
INFO:tensorflow:loss = 4.230721e+16, step = 2801 (0.151 sec)
INFO:tensorflow:global_step/sec: 612.216
INFO:tensor

<tensorflow_estimator.python.estimator.canned.linear.LinearRegressor at 0x1a521e2ac8>

In [98]:
### Evaluate Model
# Evaluate on the training split first, then on the held-out split; each call
# returns a dict of metrics.
train_eval, test_eval = [
    estimator.evaluate(input_fn=split_input_fn)
    for split_input_fn in (train_input, val_input)
]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-22T14:42:34Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/0q/_btb7f211tx6vcv4w1mntbyc0000gn/T/tmp8cs0mrv5/model.ckpt-3900
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-05-22-14:42:38
INFO:tensorflow:Saving dict for global step 3900: average_loss = 3264690800000000.0, global_step = 3900, label/mean = 12891435.0, loss = 1.0447011e+17, prediction/mean = 250.62718
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3900: /var/folders/0q/_btb7f211tx6vcv4w1mntbyc0000gn/T/tmp8cs0mrv5/model.ckpt-3900
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-22T14:42:39Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/0q/_btb7f211tx6v

In [99]:
### Print the stats for the evaluation.
# One "name: value" line per held-out metric, in alphabetical order of name.
for metric_name, metric_value in sorted(test_eval.items()):
    print("%s: %s" % (metric_name, metric_value))

average_loss: 3026607600000000.0
global_step: 3900
label/mean: 13120618.0
loss: 9.685144e+16
prediction/mean: 250.57552


In [100]:
### Prediction
# estimator.predict yields one dict per example; collect the first element of
# each 'predictions' entry into a flat array.
preds = estimator.predict(input_fn=pred_input)
first_outputs = []
for example in preds:
    first_outputs.append(example['predictions'][0])
predictions = np.array(first_outputs)

print('predictions.shape:', predictions.shape)
print('predictions:', predictions)

In [103]:
### Output answer
# Put the predicted prices next to the building ids by row position (index).
# NOTE(review): with how='outer', a length mismatch between the predictions
# and submit_test_df is not an error; the unmatched rows are written as NaN.
pred_df = pd.DataFrame(np.array(predictions), columns=['total_price'])
building_ids = submit_test_df[['building_id']]
ans_df = pd.merge(
    building_ids,
    pred_df,
    left_index=True,
    right_index=True,
    how='outer',
)
ans_df.to_csv("dataset/submit_test_5.csv", sep=',', index=False, encoding='UTF-8')