In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt 
%matplotlib inline
import tensorflow as tf
from tensorflow import keras 
import numpy as np
import sklearn
import pandas as pd
import sys, os, shutil
# Record the TensorFlow and Python versions this notebook was run with (one per line).
print(tf.__version__, sys.version_info, sep="\n")

2.0.0
sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)


## Load Data

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset through scikit-learn.
housing = fetch_california_housing()

# Dataset description, then the array shapes:
# features are (20640, 8), targets are (20640,).
print(housing.DESCR)
print(housing.data.shape, housing.target.shape, sep="\n")

# Preview the first five feature rows and their target values.
print(housing.data[:5], housing.target[:5], sep="\n")

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

## Train-Test split

In [3]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split

# First split: hold out a test set from the full dataset.
# No test_size is passed, so scikit-learn's default proportion is used.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data,
    housing.target,
    random_state=8,
)

# Second split: carve a validation set out of the remaining training data.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all,
    y_train_all,
    random_state=18,
)

## Dimensionality checking

In [4]:
# Report the feature/target shapes of each split: a label line, then the two shapes.
for split_label, split_x, split_y in (
    ("training set size:", x_train, y_train),
    ("validation set size:", x_valid, y_valid),
    ("testing set size:", x_test, y_test),
):
    print(split_label)
    print(split_x.shape, split_y.shape)

training set size:
(11610, 8) (11610,)
validation set size:
(3870, 8) (3870,)
testing set size:
(5160, 8) (5160,)


## Standardization

In [5]:
# https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler

# Standardize each feature as x = (x - u) / std so it has mean 0 and std 1.
# The mean and std are learned from the training set only (Important!) and are
# then reused unchanged to standardize the validation and testing sets.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled, x_valid_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_valid, x_test)
)

## Build the model with keras.Sequential API

In [6]:
# build model
# Fix TensorFlow's global random seed before any layer is created so that the
# randomly initialized weights are the same after every Restart & Run All.
# NOTE(review): this targets TensorFlow-side randomness only; confirm whether
# other sources of nondeterminism matter for your setup.
tf.random.set_seed(42)

# The input width is taken from the standardized training matrix (one unit per feature).
num_features = x_train_scaled.shape[1]

# Two ReLU hidden layers followed by a single linear output unit for regression.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=[num_features]),  # fully connected layer 1
    keras.layers.Dense(10, activation="relu"),  # fully connected layer 2
    keras.layers.Dense(1),  # output layer; no activation function here
])

## Compile the model


In [7]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

# Configure training: mean-squared-error loss, optimized with SGD at learning rate 0.01.
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    loss="mean_squared_error",
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                270       
_________________________________________________________________
dense_1 (Dense)              (None, 10)                310       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 591
Trainable params: 591
Non-trainable params: 0
_________________________________________________________________


## Callback functions

In [8]:
### add callback functions
# Keep logs and checkpoints in a folder under the current working directory instead
# of an absolute, machine-specific path, so the notebook runs on any machine.
# Change to your preferred directory path if needed.
logdir = os.path.join(os.getcwd(), "log_dir_regression")

# Start every run from an empty log directory: remove leftovers from a previous run.
if os.path.exists(logdir):
    shutil.rmtree(logdir)
# makedirs (unlike mkdir) also creates any missing parent directories.
os.makedirs(logdir)

# The checkpoint file lives alongside the TensorBoard logs.
output_model_fn = os.path.join(logdir, "housing_price_model.h5")

callbacks = [
    # Write TensorBoard event files into logdir.
    keras.callbacks.TensorBoard(logdir),
    # Save the model to output_model_fn, keeping only the best one seen so far.
    keras.callbacks.ModelCheckpoint(output_model_fn, save_best_only=True),
    # Stop training early; configured with patience=5 and min_delta=1e-4.
    keras.callbacks.EarlyStopping(patience=5, min_delta=1e-4),
]

## Model Training

In [None]:
# Train on the standardized training set for at most 100 epochs, evaluating on the
# standardized validation set and running the callbacks configured earlier.
history = model.fit(
    x=x_train_scaled,
    y=y_train,
    validation_data=(x_valid_scaled, y_valid),
    epochs=100,
    callbacks=callbacks,
)

# Dump the recorded training history.
print(history.history)


## Plot training history

In [None]:
def plot_learning_curves(history):
    """Plot every series stored in ``history.history`` on a single figure.

    Args:
        history: An object whose ``history`` attribute can be turned into a
            ``pandas.DataFrame`` (here, the value returned by ``model.fit``).

    The figure is 8x5 inches, has a grid, and its y-axis is clipped to [0, 3].
    """
    curves = pd.DataFrame(history.history)
    # DataFrame.plot returns the Axes it drew on; configure that Axes directly.
    ax = curves.plot(figsize=(8, 5))
    ax.grid(True)
    ax.set_ylim(0, 3)
    plt.show()


plot_learning_curves(history)