# NN with TensorFlow2

**Objectives:**
  - Define a neural network (NN) and its hidden layers using the TensorFlow DNNRegressor class
  - Train a neural network to learn nonlinearities in a dataset and achieve better performance than a linear regression model

The [data](https://developers.google.com/machine-learning/crash-course/california-housing-data-description) is based on 1990 census data from California.

## Setup
In this first cell, we'll load the necessary libraries.

In [1]:
import math
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
if tf.__version__[0] == "1":
    !pip install tensorflow==2.0.0-alpha0
print(tf.__version__)

2.0.0-alpha0


In [0]:
data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")

In [3]:
data.shape

(17000, 9)

In [4]:
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [5]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Preprocess

In [0]:
data["median_house_value"] /= 1000.0
data["rooms_per_person"] = data["total_rooms"] / data["population"]

In [7]:
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66.9,5.529064
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80.1,6.775908
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85.7,2.162162
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73.4,2.914563
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65.5,2.330128


## Split dataset

In [8]:
data = data.sample(frac=1).reset_index(drop=True)
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
0,-116.95,32.73,17.0,1635.0,272.0,960.0,279.0,5.2671,157.1,1.703125
1,-121.87,37.23,19.0,7357.0,963.0,3018.0,981.0,6.9473,361.4,2.437707
2,-118.31,33.92,35.0,1307.0,246.0,672.0,219.0,4.8456,146.4,1.94494
3,-118.19,33.79,43.0,1823.0,600.0,2339.0,560.0,1.6792,130.6,0.779393
4,-119.44,36.58,37.0,1054.0,239.0,879.0,257.0,2.5234,63.5,1.19909


In [9]:
train_df = data.loc[:12000, :]
train_df.shape

(12001, 10)

In [10]:
validation_df = data.loc[12000:, :]
validation_df.shape

(5000, 10)

In [0]:
CATEGORICAL_COLUMNS = []
NUMERIC_COLUMNS = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "rooms_per_person"
]

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
  vocabulary = train_df[feature_name].unique()
  feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
  feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

In [0]:
def make_input_fn(data_df, label_df, num_epochs, shuffle, batch_size=32):
  def input_function():
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      ds = ds.shuffle(1000)
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds
  return input_function

## Training

In [0]:
train_targets = train_df.loc[:, "median_house_value"]
train_features = train_df.drop("median_house_value", axis=1)
train_input_fn = make_input_fn(train_features, train_targets, num_epochs=10, shuffle=True)

In [14]:
estimator = tf.estimator.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=[10, 10]
)
estimator.train(train_input_fn, steps=100)
result = estimator.evaluate(train_input_fn)
print(result)

W0504 16:57:09.365365 140206609041280 estimator.py:1799] Using temporary folder as model directory: /tmp/tmpmsz_rnej
W0504 16:57:09.391799 140206609041280 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/training_util.py:238: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
W0504 16:57:09.438990 140206609041280 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1257: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0504 16:57:09.450757 140206609041280 deprecation.py:323] From 

{'average_loss': 17688.791, 'label/mean': 207.67319, 'loss': 17679.326, 'prediction/mean': 195.61778, 'global_step': 100}


In [15]:
train_predictions = estimator.predict(make_input_fn(
    train_features, 
    train_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
train_predictions = [i["predictions"][0] for i in train_predictions]
rmse = math.sqrt(metrics.mean_squared_error(train_predictions, train_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 132.9993039333784


## Validation

In [16]:
validation_targets = validation_df.loc[:, "median_house_value"]
validation_features = validation_df.drop("median_house_value", axis=1)
validation_predictions = estimator.predict(make_input_fn(
    validation_features, 
    validation_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
validation_predictions = [i["predictions"][0] for i in validation_predictions]
rmse = math.sqrt(metrics.mean_squared_error(validation_predictions, validation_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 131.5787335797126


In [17]:
calibration_data = pd.DataFrame()
calibration_data["validation_predictions"] = pd.Series(validation_predictions)
calibration_data["validation_targets"] = pd.Series(validation_targets.values)
calibration_data.describe()

Unnamed: 0,validation_predictions,validation_targets
count,5000.0,5000.0
mean,197.795688,206.419624
std,92.561216,115.028183
min,74.408089,14.999
25%,138.922508,121.075
50%,180.338554,178.3
75%,227.539032,263.125
max,1430.186279,500.001


## Testing

In [18]:
test_data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_test.csv", sep=",")
test_data.shape

(3000, 9)

In [19]:
test_data["median_house_value"] /= 1000.0
test_data["rooms_per_person"] = test_data["total_rooms"] / test_data["population"]
test_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-119.5892,35.63539,28.845333,2599.578667,529.950667,1402.798667,489.912,3.807272,205.846275,1.96632
std,1.994936,2.12967,12.555396,2155.593332,415.654368,1030.543012,365.42271,1.854512,113.119687,1.09993
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22.5,0.035955
25%,-121.81,33.93,18.0,1401.0,291.0,780.0,273.0,2.544,121.2,1.513472
50%,-118.485,34.27,29.0,2106.0,437.0,1155.0,409.5,3.48715,177.65,1.922678
75%,-118.02,37.69,37.0,3129.0,636.0,1742.75,597.25,4.656475,263.975,2.294729
max,-114.49,41.92,52.0,30450.0,5419.0,11935.0,4930.0,15.0001,500.001,33.843373


In [20]:
test_targets = test_data.loc[:, "median_house_value"]
test_features = test_data.drop("median_house_value", axis=1)
test_predictions = estimator.predict(make_input_fn(
    test_features, 
    test_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
test_predictions = [i["predictions"][0] for i in test_predictions]
rmse = math.sqrt(metrics.mean_squared_error(test_predictions, test_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 133.36618070422168


In [21]:
calibration_data = pd.DataFrame()
calibration_data["test_predictions"] = pd.Series(test_predictions)
calibration_data["test_targets"] = pd.Series(test_targets.values)
calibration_data.describe()

Unnamed: 0,test_predictions,test_targets
count,3000.0,3000.0
mean,195.444145,205.846275
std,98.658077,113.119687
min,70.488144,22.5
25%,137.259701,121.2
50%,176.626671,177.65
75%,225.5555,263.975
max,1767.188232,500.001


## Keras

In [57]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(
    10, 
    activation="linear", 
    input_shape=(train_features.shape[1],)))
model.add(tf.keras.layers.Dense(10, activation="linear"))
model.add(tf.keras.layers.Dense(1, activation="linear"))
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.009),
    loss="mse",
    metrics=["mse"])
model.fit(
    train_features.values, 
    train_targets.values, 
    epochs=50, 
    batch_size=100,
#     validation_data=(validation_features.values, validation_targets.values)
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f83f757be48>

In [58]:
train_predictions = model.predict(train_features.values, batch_size=32)
rmse = math.sqrt(metrics.mean_squared_error(train_predictions, train_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 77.48481292668825


In [59]:
validation_predictions = model.predict(validation_features.values, batch_size=32)
rmse = math.sqrt(metrics.mean_squared_error(validation_predictions, validation_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 76.62370086919215


In [60]:
test_predictions = model.predict(test_features.values, batch_size=32)
rmse = math.sqrt(metrics.mean_squared_error(test_predictions, test_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 77.18837450490312
