In [1]:
import pandas as pd

# Load the California housing CSV into a DataFrame (read_csv's default header
# handling takes the column names from the first row) and preview five rows.
df_data_1 = pd.read_csv(filepath_or_buffer='cal_housing_data with headers.csv')
df_data_1.head(n=5)

Unnamed: 0,Longitude,Latitude,HousingMedianAge,TotalRooms,TotalBedrooms,Population,Households,MedianIncomeValue,MedianHouseValue
0,-122.23,37.88,41,880,129,322,126,8.3252,452600
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500
2,-122.24,37.85,52,1467,190,496,177,7.2574,352100
3,-122.25,37.85,52,1274,235,558,219,5.6431,341300
4,-122.25,37.85,52,1627,280,565,259,3.8462,342200


In [2]:
import numpy as np

# Copy the dataframe's values into a standalone, row-major numpy array
data = np.array(df_data_1.values, order='C')

# Split the array column-wise into the 'predictors' (aka 'features') and the
# dependent variable (aka 'label') that we will learn how to predict.
# Column 8 is dropped from the features; everything from column 8 onward is the label.
label_col = 8
housing_data = np.concatenate((data[:, :label_col], data[:, label_col + 1:]), axis=1)
housing_target = data[:, label_col:].copy()

In [3]:
# m = number of observations (rows), n = number of features (columns)
m, n = housing_data.shape

# Prepend a column of ones so that the first fitted coefficient acts as the
# constant (intercept) term of the linear equation
bias_column = np.ones((m, 1))
housing_data_plus_bias = np.hstack((bias_column, housing_data))

In [4]:
import tensorflow as tf

# Build the compute graph for the closed-form least-squares solution
# (the normal equation): theta = (X^T X)^-1 X^T y
X = tf.constant(housing_data_plus_bias, dtype=tf.float64, name="X")
XT = tf.transpose(X)
y = tf.constant(housing_target.reshape(-1, 1), dtype=tf.float64, name="y")

gram_inverse = tf.matrix_inverse(tf.matmul(XT, X))   # (X^T X)^-1
pseudo_inverse = tf.matmul(gram_inverse, XT)         # (X^T X)^-1 X^T
theta = tf.matmul(pseudo_inverse, y)

# Evaluate the graph; the session is closed automatically on leaving the block
with tf.Session() as sess:
    theta_value = sess.run(theta)

In [5]:
# Show the fitted linear regression model (i.e. the coefficients of the linear equation).
# Row 0 belongs to the column of ones that was prepended to the features, so it is the
# constant term; the remaining rows are the feature coefficients in column order.
theta_value

array([[ -3.59402294e+06],
       [ -4.28237438e+04],
       [ -4.25767219e+04],
       [  1.15630387e+03],
       [ -8.18164928e+00],
       [  1.13410689e+02],
       [ -3.85350953e+01],
       [  4.83082868e+01],
       [  4.02485142e+04]])

In [6]:
# Make a subdirectory in which to save the model
!mkdir "../datasets/Linear Regression"

mkdir: ../datasets/Linear Regression: File exists


In [7]:
# Save the model: wrap the fitted coefficients in a TensorFlow variable named
# "model" so that a Saver can write them to a checkpoint on disk
model = tf.Variable(tf.constant(theta_value, dtype=tf.float64), name="model")

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as saver_sess:
    # Variables must be initialized in this session before they can be read or saved
    saver_sess.run(init)
    theta_value = saver_sess.run(model)
    save_path = saver.save(saver_sess, "../datasets/Linear Regression/Linear Regression.ckpt")

In [8]:
# List the files that comprise the saved model (the contents of the directory
# that the checkpoint was written to)
!ls "../datasets/Linear Regression"

Linear Regression.ckpt.data-00000-of-00001
Linear Regression.ckpt.index
Linear Regression.ckpt.meta
checkpoint


In [9]:
# Restore the saved model
# NOTE: This should run on inference service initialization, not on every inference

# Import the saved graph definition into its own Graph rather than the default
# graph, which already holds the training-time nodes. This keeps the name
# 'model:0' unambiguous and lets the cell be re-run without piling duplicate
# nodes into the default graph.
restore_graph = tf.Graph()
with restore_graph.as_default():
    saver = tf.train.import_meta_graph('../datasets/Linear Regression/Linear Regression.ckpt.meta')

# The context manager closes the session even if restoring the checkpoint raises
with tf.Session(graph=restore_graph) as sess_restore:
    saver.restore(sess_restore, tf.train.latest_checkpoint('../datasets/Linear Regression/'))

    # Fetch the restored coefficients by tensor name
    theta_value = sess_restore.run('model:0')

INFO:tensorflow:Restoring parameters from ../datasets/Linear Regression/Linear Regression.ckpt


In [10]:
# Show the linear regression model again, this time as read back from the checkpoint
theta_value

array([[ -3.59402294e+06],
       [ -4.28237438e+04],
       [ -4.25767219e+04],
       [  1.15630387e+03],
       [ -8.18164928e+00],
       [  1.13410689e+02],
       [ -3.85350953e+01],
       [  4.83082868e+01],
       [  4.02485142e+04]])

In [11]:
# Single-record inference: predict one value with the model.
# housing_data[0] stands in for the input an inference service would receive.

# TODO: This can be rewritten as TensorFlow code at some point, but that would be more typical of
#       larger models. With only 9 terms, this would likely be slower as TensorFlow code

# Feature coefficients, i.e. every row of theta_value except the constant term in row 0
coefficients = theta_value[1:]

# Begin with the linear equation's constant term, then add each feature's
# contribution: (coefficient of the j-th feature) * (j-th feature of the input record)
predicted_value = theta_value[0][0]
for coefficient_row, feature_value in zip(coefficients, housing_data[0]):
    predicted_value += coefficient_row[0] * feature_value

In [12]:
# Show the value predicted for the single input record, housing_data[0]
predicted_value

411111.09606514324

In [13]:
# This is some earlier code written to do the predictions on all items of training data,
# but this batch processing of predictions is NOT the baseline use case expected for CP10 and CP47 services

# Start by setting each predicted value equal to the linear equation's constant term
predicted_values = np.full(m, theta_value[0][0])

# Get the coefficients of the features (i.e. exclude the constant term accounted for above)
coefficients = theta_value[1:]

# Add one feature's contribution at a time, for all m rows at once:
#     predicted_values[i] += (coefficient of feature j) * housing_data[i][j]
# Iterating over coefficients[:, 0] yields scalars, so no length-1 array is
# implicitly converted to a scalar on assignment (deprecated in recent NumPy).
# Each row still accumulates its terms in the same order (constant term, then
# feature 0, 1, ...) as a per-row Python loop would, without the m * n
# interpreted iterations.
for j, c in enumerate(coefficients[:, 0]):
    predicted_values += c * housing_data[:, j]

In [14]:
# Show the batch of predicted values, one per row of housing_data
predicted_values

array([ 411111.09606514,  416144.49078677,  380432.65417531, ...,
         25026.16974547,   37991.19625605,   55550.98309601])

In [15]:
## For when you want to wipe out the training and do it again
# !rm -rf "../datasets/Linear Regression"

In [16]:
# Flatten the label array into a 1-D copy of the actual house prices,
# for use in the R^2 calculations below
y_actual = housing_target.flatten()
y_actual

array([ 452600.,  358500.,  352100., ...,   92300.,   84700.,   89400.])

In [17]:
# Reference value: let scikit-learn compute R^2 (the coefficient of determination),
# which measures the quality of the regression model.
from sklearn.metrics import r2_score

R2 = r2_score(y_true=y_actual, y_pred=predicted_values)
R2

0.63710562292234463

In [18]:
# The rest of the notebook recomputes R^2 by hand to show how it characterizes
# regression model quality. First ingredient: the mean of the actual dependent variable.
y_bar = y_actual.mean()
y_bar

206855.81690891474

In [19]:
# Total sum of squares: how far the actual values spread around their mean,
# accumulated one observation at a time
SStot = 0.0
for actual in y_actual:
    deviation = float(actual - y_bar)
    SStot = SStot + deviation * deviation
SStot

274831981936881.9

In [20]:
# Regression (explained) sum of squares: how far the model's predicted values
# spread around the mean of the actual values
SSreg = 0.0
for predicted in predicted_values:
    deviation = float(predicted - y_bar)
    SSreg = SSreg + deviation * deviation
SSreg

175097001050335.3

In [21]:
# R squared as a ratio: the fraction of the total variation around the mean (SStot)
# that is accounted for by using the regression model to predict values (SSreg) instead
# of just always using the mean as the predicted value for any observation in the group.
# NOTE: SSreg / SStot agrees with 1 - SSres / SStot only for a least-squares fit that
# includes a constant term, which is how the model above was fitted.
R_squared = SSreg / SStot
R_squared

0.6371056229203638

In [22]:
# A second view: the error that remains after fitting. The residual sum of squares
# adds up the squared gaps between each predicted value and the matching actual value.
SSres = 0.0
for predicted, actual in zip(predicted_values, y_actual):
    residual = float(predicted - actual)
    SSres = SSres + residual * residual
SSres

99734980886003.83

In [23]:
# So R squared can also be computed from the fraction of leftover (residual) variation:
# one minus the share of the total sum of squares that the model fails to explain
R_squared = 1.0 - SSres / SStot
R_squared

0.6371056229223386