# First Steps with TensorFlow

**Objectives:**
  * Use multiple features, instead of a single feature, to further improve the effectiveness of a model
  * Use a test data set to check if a model is overfitting the validation data

The [data](https://developers.google.com/machine-learning/crash-course/california-housing-data-description) is based on 1990 census data from California.

## Setup
In this first cell, we'll load the necessary libraries.

In [1]:
import math
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
if tf.__version__[0] == "1":
    !pip install tensorflow==2.0.0-alpha0
print(tf.__version__)

2.0.0-alpha0


In [0]:
data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")

In [3]:
data.shape

(17000, 9)

In [4]:
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [5]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Preprocess

In [0]:
data["median_house_value"] /= 1000.0
data["rooms_per_person"] = data["total_rooms"] / data["population"]

In [7]:
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66.9,5.529064
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80.1,6.775908
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85.7,2.162162
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73.4,2.914563
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65.5,2.330128


## Split dataset

In [8]:
data = data.sample(frac=1).reset_index(drop=True)
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
0,-118.37,34.16,10.0,2606.0,748.0,1373.0,680.0,3.6128,225.0,1.898034
1,-117.98,34.05,33.0,1560.0,315.0,1467.0,313.0,4.1429,159.8,1.063395
2,-118.26,34.02,38.0,980.0,285.0,1308.0,310.0,1.5652,123.1,0.749235
3,-118.27,33.99,41.0,656.0,162.0,730.0,170.0,1.8047,101.8,0.89863
4,-121.24,38.75,5.0,9137.0,1368.0,3667.0,1294.0,5.4896,229.6,2.491683


In [9]:
train_df = data.loc[:12000, :]
train_df.shape

(12001, 10)

In [10]:
validation_df = data.loc[12000:, :]
validation_df.shape

(5000, 10)

In [0]:
CATEGORICAL_COLUMNS = []
NUMERIC_COLUMNS = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "rooms_per_person"
]

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
  vocabulary = train_df[feature_name].unique()
  feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
  feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

In [0]:
def make_input_fn(data_df, label_df, num_epochs, shuffle, batch_size=32):
  def input_function():
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      ds = ds.shuffle(1000)
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds
  return input_function

## Training

In [0]:
train_targets = train_df.loc[:, "median_house_value"]
train_features = train_df.drop("median_house_value", axis=1)
train_input_fn = make_input_fn(train_features, train_targets, num_epochs=10, shuffle=True)
# validation_input_fn = make_input_fn(features, targets, num_epochs=1, shuffle=False)

In [14]:
linear_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)
linear_est.train(train_input_fn, steps=100)
result = linear_est.evaluate(train_input_fn)
print(result)

W0503 15:58:46.898151 140299310606208 estimator.py:1799] Using temporary folder as model directory: /tmp/tmpglghf7bb
W0503 15:58:46.923208 140299310606208 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/training_util.py:238: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
W0503 15:58:47.124980 140299310606208 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:2758: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
W0503 15:58:47.629937 140299310606208 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/slot_creator.py:1

{'average_loss': 15004.299, 'label/mean': 207.55344, 'loss': 15005.474, 'prediction/mean': 152.06339, 'global_step': 100}


In [15]:
train_predictions = linear_est.predict(make_input_fn(
    train_features, 
    train_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
train_predictions = [i["predictions"][0] for i in train_predictions]
rmse = math.sqrt(metrics.mean_squared_error(train_predictions, train_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 122.4920222259388


## Validation

In [16]:
validation_targets = validation_df.loc[:, "median_house_value"]
validation_features = validation_df.drop("median_house_value", axis=1)
validation_predictions = linear_est.predict(make_input_fn(
    validation_features, 
    validation_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
validation_predictions = [i["predictions"][0] for i in validation_predictions]
rmse = math.sqrt(metrics.mean_squared_error(validation_predictions, validation_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 121.60618486256071


In [17]:
calibration_data = pd.DataFrame()
calibration_data["validation_predictions"] = pd.Series(validation_predictions)
calibration_data["validation_targets"] = pd.Series(validation_targets.values)
calibration_data.describe()

Unnamed: 0,validation_predictions,validation_targets
count,5000.0,5000.0
mean,151.801297,206.694606
std,27.013387,115.151733
min,-685.111389,14.999
25%,140.385418,119.175
50%,153.428497,179.2
75%,165.528328,263.225
max,262.032227,500.001


In [18]:
calibration_data.head(50)

Unnamed: 0,validation_predictions,validation_targets
0,179.635178,197.5
1,133.281219,163.0
2,130.418365,159.1
3,144.723297,198.1
4,93.311195,165.3
5,149.096222,90.2
6,144.136307,184.8
7,137.930618,173.9
8,143.577881,140.3
9,157.499207,215.6


## Testing

In [19]:
test_data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_test.csv", sep=",")
test_data.shape

(3000, 9)

In [20]:
test_data["median_house_value"] /= 1000.0
test_data["rooms_per_person"] = test_data["total_rooms"] / test_data["population"]
test_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-119.5892,35.63539,28.845333,2599.578667,529.950667,1402.798667,489.912,3.807272,205.846275,1.96632
std,1.994936,2.12967,12.555396,2155.593332,415.654368,1030.543012,365.42271,1.854512,113.119687,1.09993
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22.5,0.035955
25%,-121.81,33.93,18.0,1401.0,291.0,780.0,273.0,2.544,121.2,1.513472
50%,-118.485,34.27,29.0,2106.0,437.0,1155.0,409.5,3.48715,177.65,1.922678
75%,-118.02,37.69,37.0,3129.0,636.0,1742.75,597.25,4.656475,263.975,2.294729
max,-114.49,41.92,52.0,30450.0,5419.0,11935.0,4930.0,15.0001,500.001,33.843373


In [21]:
test_targets = test_data.loc[:, "median_house_value"]
test_features = test_data.drop("median_house_value", axis=1)
test_predictions = linear_est.predict(make_input_fn(
    test_features, 
    test_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
test_predictions = [i["predictions"][0] for i in test_predictions]
rmse = math.sqrt(metrics.mean_squared_error(test_predictions, test_targets))
print("Train RMSE: {}".format(rmse))

Train RMSE: 119.08034447557628


In [22]:
calibration_data = pd.DataFrame()
calibration_data["test_predictions"] = pd.Series(test_predictions)
calibration_data["test_targets"] = pd.Series(test_targets.values)
calibration_data.describe()

Unnamed: 0,test_predictions,test_targets
count,3000.0,3000.0
mean,152.181362,205.846275
std,24.170347,113.119687
min,-106.642929,22.5
25%,140.160095,121.2
50%,152.858673,177.65
75%,165.411884,263.975
max,388.535339,500.001


In [23]:
calibration_data.head(50)

Unnamed: 0,test_predictions,test_targets
0,175.572327,344.7
1,163.822159,176.5
2,163.863419,270.5
3,150.933578,330.0
4,134.993027,81.7
5,153.949738,67.0
6,165.332352,67.0
7,138.864212,166.9
8,151.62381,194.4
9,95.590561,164.2
