In [41]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep DataFrame output compact: at most 10 rows, floats with one decimal.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

In [42]:
# Load the California housing training set from its public URL.
california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")

# Shuffle the rows. A dedicated, fixed-seed generator makes the row order
# identical on every Restart & Run All instead of changing with each run.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.RandomState(42).permutation(california_housing_dataframe.index))

# Scale the target down by a factor of 1000 so its magnitude is closer to the
# other columns (presumably dollars -> thousands of dollars; confirm against
# the dataset description).
california_housing_dataframe["median_house_value"] /= 1000.0
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
13421,-121.9,37.3,39.0,1030.0,191.0,537.0,175.0,3.9,236.9
11222,-121.1,37.7,31.0,906.0,146.0,383.0,129.0,3.4,196.9
15142,-122.3,37.8,31.0,4596.0,1331.0,2048.0,1180.0,2.8,183.8
4417,-118.0,34.1,24.0,2343.0,834.0,3537.0,824.0,2.1,135.2
11425,-121.2,37.8,16.0,2085.0,342.0,1236.0,345.0,5.6,149.3
...,...,...,...,...,...,...,...,...,...
4251,-118.0,33.8,24.0,2578.0,580.0,1217.0,529.0,2.2,212.5
8827,-118.7,34.2,10.0,3663.0,409.0,1179.0,371.0,12.5,500.0
6831,-118.3,34.0,40.0,1695.0,374.0,1138.0,357.0,2.7,150.0
15356,-122.3,37.9,48.0,2365.0,490.0,1034.0,475.0,3.1,229.2


In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
california_housing_dataframe.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207.3
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,116.0
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,15.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180.4
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500.0


## Build the first model

In [44]:
# Pull out the single input column. Selecting with a list of labels keeps the
# result two-dimensional (a one-column DataFrame) rather than a Series.
my_feature = california_housing_dataframe.loc[:, ['total_rooms']]
my_feature

Unnamed: 0,total_rooms
13421,1030.0
11222,906.0
15142,4596.0
4417,2343.0
11425,2085.0
...,...
4251,2578.0
8827,3663.0
6831,1695.0
15356,2365.0


In [45]:
# Describe the model's only input: a numeric feature column keyed by
# 'total_rooms', matching the column name used in my_feature.
feature_columns = [tf.feature_column.numeric_column('total_rooms')]
feature_columns

[_NumericColumn(key='total_rooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

## Define the Target

In [46]:
# The label to predict: the (already rescaled) median house value of each row,
# selected as a Series that keeps the frame's shuffled row labels.
targets = california_housing_dataframe.loc[:, 'median_house_value']
targets

13421   236.9
11222   196.9
15142   183.8
4417    135.2
11425   149.3
         ... 
4251    212.5
8827    500.0
6831    150.0
15356   229.2
910     112.4
Name: median_house_value, Length: 17000, dtype: float64

## Configure the Regressor

In [47]:
# Plain gradient descent with a very small step size, wrapped so that
# gradients are clipped to a norm of 5.0 before they are applied.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=0.0000001),
    5.0,
)

# Linear regression estimator over the declared feature columns.
regressor = tf.estimator.LinearRegressor(
    optimizer=my_optimizer,
    feature_columns=feature_columns,
)
regressor

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x1aad9c62e10>

## Define input function

In [54]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Builds the input pipeline that feeds (features, labels) batches to an Estimator.

    Args:
      features: pandas DataFrame of features
      targets: pandas Series (or DataFrame) of targets, in the same row order as features
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """

    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual (features, target) examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle individual examples, if specified. This has to happen before
    # batching and repeating: shuffling afterwards would only reorder whole
    # batches and would mix examples across epoch boundaries.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

## Train the model

In [55]:
# Run 100 training steps. The lambda defers building the input pipeline until
# the Estimator calls it; my_input_fn's defaults are used for batching,
# shuffling and repetition.
regressor.train(input_fn = lambda:my_input_fn(my_feature, targets), steps=100)

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x1aad9c62e10>

## Evaluate the model

In [64]:
# Create an input function for predictions.
# Note: Since we're making just one prediction for each example, we don't
# need to repeat or shuffle the data here.
prediction_input_fn = lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)

# Call predict() on the regressor to make predictions.
predictions = regressor.predict(input_fn=prediction_input_fn)

# Format predictions as a NumPy array, so we can calculate error metrics.
predictions = np.array([item['predictions'][0] for item in predictions])

# Compute Mean Squared Error and Root Mean Squared Error on the training data.
# scikit-learn's signature is (y_true, y_pred), so the targets go first.
mean_squared_error = metrics.mean_squared_error(targets, predictions)
root_mean_squared_error = math.sqrt(mean_squared_error)

In [65]:
# Report both error metrics, rounded to three decimal places.
print("Mean Squared Error (on training data): {:0.3f}".format(mean_squared_error))
print("Root Mean Squared Error (on training data): {:0.3f}".format(root_mean_squared_error))

Mean Squared Error (on training data): 56251.029
Root Mean Squared Error (on training data): 237.173


Is this a good model? How would you judge how large this error is?

*Mean Squared Error* (MSE) can be hard to interpret because it is expressed in squared units of the target, so we often look at *Root Mean Squared Error* (RMSE) instead. A nice property of RMSE is that it can be interpreted on the same scale as the original targets.

Let's compare the RMSE to the difference between the min and max of our targets:

In [67]:
# Range of the target column, used to put the RMSE into context.
max_house_value = california_housing_dataframe["median_house_value"].max()
min_house_value = california_housing_dataframe["median_house_value"].min()
min_max_difference = max_house_value - min_house_value

# One value per line: minimum, maximum, then their difference.
print(min_house_value, max_house_value, min_max_difference, sep="\n")

14.999
500.001
485.00199999999995


## Make calibration dataset

In [70]:
# Pair each prediction with its target *by position*. `targets` still carries
# the shuffled row labels of the permuted frame, so wrapping it in pd.Series
# as-is would make pandas align it on those labels against the 0..N-1 index
# created by the predictions column, pairing each prediction with the wrong
# row's target. Taking `.values` drops the labels first.
calibration_data = pd.DataFrame()
calibration_data['predictions'] = pd.Series(predictions)
calibration_data['targets'] = pd.Series(targets.values)

In [72]:
# Compare the distribution of the predictions with that of the targets.
calibration_data.describe()

Unnamed: 0,predictions,targets
count,17000.0,17000.0
mean,0.4,207.3
std,0.3,116.0
min,0.0,15.0
25%,0.2,119.4
50%,0.3,180.4
75%,0.5,265.0
max,5.7,500.0
