#### Goals
Use the LinearRegressor class in TensorFlow to predict median housing price and improve the model accuracy using hyperparameter tuning

#### Reference
Coursera - Machine Learning with TensorFlow on Google Cloud Platform notebook: machine_learning/deepdive/05_artandscience/a_handtuning.ipynb

In [4]:
import math
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf

In [5]:
# Show which TensorFlow build is in use, then configure logging and table display.
print(tf.__version__)
tf.logging.set_verbosity(tf.logging.INFO)  # surface INFO-level training/eval logs
pd.set_option('display.max_rows', 10)  # keep rendered DataFrames short
pd.set_option('display.float_format', '{:.1f}'.format)  # one decimal place for floats

1.11.0


In [6]:
# Read the California housing training dataset (comma-separated) into a DataFrame
df = pd.read_csv(
    "https://storage.googleapis.com/ml_universities/california_housing_train.csv",
    sep=",",
)

In [7]:
# Add a feature: total rooms divided by households (rooms per household)
df['num_rooms'] = df['total_rooms'].div(df['households'])
# Summary statistics, including the new column
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,num_rooms
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207300.9,5.4
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,115983.8,2.5
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,14999.0,0.8
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119400.0,4.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180400.0,5.2
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265000.0,6.1
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500001.0,141.9


In [8]:
# Split into train and eval
np.random.seed(1)  # fixed seed makes the random split reproducible
msk = np.random.rand(df.shape[0]) < 0.8  # one uniform draw per row; True selects the row for training
traindf = df.loc[msk]
evaldf = df.loc[np.logical_not(msk)]

In [9]:
# Labels are divided by SCALE before training; the RMSE metric multiplies back by it
SCALE = 10 ** 5
# Model directory handed to the estimator; removed before each training run
OUTDIR = './housing_trained'

# RMSE evaluation metric
def rmse(labels, predictions):
    """Build an RMSE metric expressed in the original (unscaled) label units.

    Args:
        labels: label tensor; multiplied by SCALE before comparison.
        predictions: dict whose 'predictions' entry holds the model output.

    Returns:
        A dict mapping 'rmse' to the result of
        tf.metrics.root_mean_squared_error on the rescaled values.
    """
    # Cast so the predictions share float64 with the value they are compared against
    predicted = tf.cast(predictions['predictions'], tf.float64)
    rescaled_labels = labels * SCALE
    rescaled_predictions = predicted * SCALE
    metric = tf.metrics.root_mean_squared_error(rescaled_labels, rescaled_predictions)
    return {'rmse': metric}

def train_and_evaluate(output_dir, num_train_steps = 100, learning_rate = 0.2, batch_size = 512):
    """Train a linear regressor on `traindf` and evaluate it on `evaldf`.

    The model uses the single numeric feature 'num_rooms' to predict
    'median_house_value' divided by SCALE, and reports the custom `rmse` metric.

    Args:
        output_dir: model directory passed to the estimator.
        num_train_steps: value passed as `max_steps` of the TrainSpec.
        learning_rate: learning rate of the FTRL optimizer.
        batch_size: batch size used by the training input function.

    Returns:
        None.
    """
    ftrl = tf.train.FtrlOptimizer(learning_rate = learning_rate)
    regressor = tf.estimator.LinearRegressor(
        model_dir = output_dir,
        feature_columns = [tf.feature_column.numeric_column('num_rooms')],
        optimizer = ftrl)

    # Wrap the estimator so evaluation also reports the custom RMSE metric
    regressor = tf.contrib.estimator.add_metrics(regressor, rmse)

    # Training input: shuffled batches with no epoch limit; max_steps bounds the run
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x = traindf[["num_rooms"]],
        y = traindf["median_house_value"] / SCALE,
        num_epochs = None,
        batch_size = batch_size, # note the batch size
        shuffle = True)
    training = tf.estimator.TrainSpec(
        input_fn = train_input_fn,
        max_steps = num_train_steps)

    # Evaluation input: a single unshuffled pass over the held-out rows
    eval_input_fn = tf.estimator.inputs.pandas_input_fn(
        x = evaldf[["num_rooms"]],
        y = evaldf["median_house_value"] / SCALE,
        num_epochs = 1,
        shuffle = False)
    evaluation = tf.estimator.EvalSpec(
        input_fn = eval_input_fn,
        steps = None,
        start_delay_secs = 1,
        throttle_secs = 10)

    tf.estimator.train_and_evaluate(regressor, training, evaluation)


In [10]:
# Train a baseline model, starting from an empty model directory
shutil.rmtree(OUTDIR, ignore_errors = True)
train_and_evaluate(
    OUTDIR,
    num_train_steps = 100,
    learning_rate = 0.2,
    batch_size = 100)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './housing_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x103d41208>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using config: {'_model_dir': './housing_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps

In [11]:
# Train with more steps and a lower learning rate, again from an empty model directory
shutil.rmtree(OUTDIR, ignore_errors = True)
train_and_evaluate(
    OUTDIR,
    num_train_steps = 300,
    learning_rate = 0.1,
    batch_size = 100)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './housing_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb2ec8dbe0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using config: {'_model_dir': './housing_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps

In [14]:
# Instead of hand-tuning, calculate the number of steps from the data:
# (len(traindf) / batch_size) steps cover the training set once, and dividing
# by the learning rate gives more steps when the learning rate is smaller.
batch_size = 100
learning_rate = 0.1
# True division yields a float, but the value is used as max_steps (a step
# count), so round it to the nearest whole number before passing it on.
num_steps = int(round((len(traindf) / batch_size) / learning_rate))
shutil.rmtree(OUTDIR, ignore_errors = True)
train_and_evaluate(OUTDIR, num_steps, learning_rate, batch_size)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './housing_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb313e8550>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using config: {'_model_dir': './housing_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps

#### Results
Model 1: num_train_steps = 100, learning_rate = 0.2, batch_size = 100 has RMSE of 152387.45

Model 2: num_train_steps = 300, learning_rate = 0.1, batch_size = 100 has RMSE of 120531.75

Model 3: num_train_steps = 1362, learning_rate = 0.1, batch_size = 100 has RMSE of 112952.53 <-- Most accurate