# Boston Housing Dataset

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from IPython import display
import pandas as pd
import tensorflow as tf

# Notebook-wide display settings: show at most 10 rows of any DataFrame and
# render floats with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

# Only surface TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

1.	CRIM - per capita crime rate by town
2.	ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
3.	INDUS - proportion of non-retail business acres per town.
4.	~~CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)~~
5.	NOX - nitric oxides concentration (parts per 10 million)
6.	RM - average number of rooms per dwelling
7.	AGE - proportion of owner-occupied units built prior to 1940
8.	DIS - weighted distances to five Boston employment centres
9.	~~RAD - index of accessibility to radial highways~~
10.	TAX - full-value property-tax rate per USD 10,000
11.	PTRATIO - pupil-teacher ratio by town
12.	~~B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town~~
13.	~~LSTAT - % lower status of the population~~
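14.	MEDV - median value of owner-occupied homes, in thousands of USD (the prediction target)

Struck-through attributes belong to the original dataset but are not among the columns used in this notebook.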



In [2]:
# Column names applied to every CSV loaded in this notebook, in file order.
COLUMNS = [
    "crim", "zn", "indus", "nox", "rm",
    "age", "dis", "tax", "ptratio",
    "medv",
]

# Every column except the last is a model input; the last one is the label.
FEATURES, TARGET = COLUMNS[:-1], COLUMNS[-1]

for template, value in (("Features: {}", FEATURES), ("Target: {}", TARGET)):
    print(template.format(value))

Features: ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio']
Target: medv


In [0]:
def my_input_fn(data_set, num_epochs=None, shuffle=True):
    """Build a TensorFlow input function from a pandas DataFrame.

    Args:
      data_set: DataFrame containing every column named in FEATURES plus the
        TARGET column.
      num_epochs: Forwarded unchanged to `pandas_input_fn` as `num_epochs`.
      shuffle: Forwarded unchanged to `pandas_input_fn` as `shuffle`.

    Returns:
      Whatever `tf.estimator.inputs.pandas_input_fn` returns for the given
      features and labels.
    """
    # Rebuild features and labels from the raw `.values` arrays, column by
    # column, rather than passing slices of `data_set` through directly.
    feature_frame = pd.DataFrame(
        {name: data_set[name].values for name in FEATURES})
    label_series = pd.Series(data_set[TARGET].values)

    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=label_series,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [4]:
# Base URL of the Boston housing CSV files from the TensorFlow input_fn tutorial.
# NOTE(review): this points at the `master` branch, so the files may move or
# change over time -- consider pinning to a release tag or commit; confirm.
URL_PREFIX = "https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/input_fn/"


def load_boston_csv(filename):
    """Read one Boston housing CSV located under URL_PREFIX.

    Args:
      filename: Name of the CSV file, appended to URL_PREFIX.

    Returns:
      A pandas DataFrame whose columns are named after COLUMNS.
    """
    return pd.read_csv(
        URL_PREFIX + filename,
        skipinitialspace=True,
        skiprows=1,  # Skip the file's first line; column names come from COLUMNS.
        names=COLUMNS)


training_set = load_boston_csv("boston_train.csv")
test_set = load_boston_csv("boston_test.csv")
prediction_set = load_boston_csv("boston_predict.csv")

# Show summary statistics for each split, in the same order they were loaded.
for summary_title, summary_frame in [
        ("Training set", training_set),
        ("Test set", test_set),
        ("Prediction set", prediction_set)]:
    print("{} summary:".format(summary_title))
    display.display(summary_frame.describe())

Training set summary:


Unnamed: 0,crim,zn,indus,nox,rm,age,dis,tax,ptratio,medv
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,3.9,11.3,11.4,0.6,6.3,69.1,3.8,411.7,18.5,22.6
std,9.4,23.5,7.0,0.1,0.7,28.5,2.1,171.6,2.2,9.6
min,0.0,0.0,0.5,0.4,3.6,2.9,1.1,187.0,12.6,5.0
25%,0.1,0.0,5.2,0.4,5.9,44.9,2.1,277.0,17.4,16.6
50%,0.3,0.0,9.7,0.5,6.2,78.8,3.2,332.0,19.1,21.4
75%,3.7,12.5,18.1,0.6,6.6,94.6,5.1,666.0,20.2,25.0
max,89.0,100.0,27.7,0.9,8.8,100.0,12.1,711.0,22.0,50.0


Test set summary:


Unnamed: 0,crim,zn,indus,nox,rm,age,dis,tax,ptratio,medv
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,2.4,10.8,10.3,0.5,6.3,66.9,3.9,395.1,18.3,22.1
std,4.6,22.3,6.2,0.1,0.6,26.5,2.1,154.1,2.2,7.5
min,0.0,0.0,1.5,0.4,5.0,8.4,1.4,188.0,12.6,7.4
25%,0.1,0.0,5.9,0.5,5.9,48.0,2.2,286.2,16.8,18.3
50%,0.2,0.0,8.6,0.5,6.2,71.6,3.2,330.0,18.6,20.9
75%,2.2,12.5,18.1,0.6,6.6,89.9,5.5,432.0,20.2,24.4
max,22.6,82.5,25.6,0.9,8.7,100.0,10.6,666.0,22.0,50.0


Prediction set summary:


Unnamed: 0,crim,zn,indus,nox,rm,age,dis,tax,ptratio,medv
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.0
mean,2.3,22.2,9.4,0.6,6.4,60.4,4.3,399.0,19.2,
std,3.5,29.6,7.3,0.1,0.7,32.4,2.2,208.7,0.9,
min,0.0,0.0,2.2,0.4,5.4,15.8,1.7,222.0,18.3,
25%,0.1,0.0,3.5,0.5,5.9,41.7,2.8,260.0,18.4,
50%,0.2,12.5,7.5,0.5,6.5,57.5,4.0,294.0,19.0,
75%,3.9,31.0,16.1,0.7,7.0,86.8,5.1,575.5,20.1,
max,8.2,75.0,18.1,0.7,7.2,98.9,8.0,666.0,20.2,


In [0]:
# One numeric feature column per name in FEATURES, in the same order.
my_feature_columns = list(map(tf.feature_column.numeric_column, FEATURES))

In [0]:
# DNN regressor over the numeric feature columns, with hidden_units=[10, 10].
# NOTE(review): model_dir is a fixed path; presumably checkpoints already
# stored there are picked up when the notebook is re-run -- confirm, and clear
# the directory if a fresh training run is wanted.
regressor = tf.estimator.DNNRegressor(
    hidden_units=[10, 10],
    feature_columns=my_feature_columns,
    model_dir="/tmp/boston_housing",
)

In [7]:
# Train for 5000 steps; my_input_fn's defaults (num_epochs=None, shuffle=True)
# are used for the training input.
training_input_fn = my_input_fn(training_set)
regressor.train(input_fn=training_input_fn, steps=5000)

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7f75aebc88d0>

In [8]:
# Evaluate on the test set with a single, unshuffled pass over the data.
test_input_fn = my_input_fn(test_set, num_epochs=1, shuffle=False)
evaluation = regressor.evaluate(input_fn=test_input_fn)

print("Evaluation: {}".format(evaluation))
loss = evaluation["loss"]
print("Loss: {0:f}".format(loss))

Evaluation: {'average_loss': 12.271619, 'global_step': 5000, 'loss': 1227.1619}
Loss: 1227.161865


In [9]:
# Predict on the prediction set with a single, unshuffled pass so the printed
# results follow the order of the rows in prediction_set.
prediction_input_fn = my_input_fn(prediction_set, num_epochs=1, shuffle=False)
predictions = regressor.predict(input_fn=prediction_input_fn)

for prediction in predictions:
    message = "Prediction: {}".format(prediction)
    print(message)

Prediction: {'predictions': array([36.164463], dtype=float32)}
Prediction: {'predictions': array([18.884136], dtype=float32)}
Prediction: {'predictions': array([22.6753], dtype=float32)}
Prediction: {'predictions': array([36.28027], dtype=float32)}
Prediction: {'predictions': array([15.057931], dtype=float32)}
Prediction: {'predictions': array([18.617287], dtype=float32)}
