In [66]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Basics

In [2]:
# Two graph variables; nothing is computed until they are initialized and run in a session.
x, y = tf.Variable(3, name='x'), tf.Variable(4, name='y')

In [3]:
# Build the expression f(x, y) = x^2 * y + y + 2 from the variables x and y.
# The recorded output of type(f) below shows that f is a Tensor, not a computed number.
f = x*x*y + y + 2

In [4]:
# Inspect what kind of object the expression f is.
type(f)

tensorflow.python.framework.ops.Tensor

In [5]:
# Inspect what kind of object x is, for comparison with f.
type(x)

tensorflow.python.ops.variables.Variable

In [9]:
# Evaluate f with an explicitly managed session: initialize each variable,
# run the graph, then release the session's resources.
sess = tf.Session()
sess.run(x.initializer)
sess.run(y.initializer)
result = sess.run(f)
# Close explicitly: `sess` is rebound by the next cell, so without this the
# session opened here would never be closed.
sess.close()
result

42

In [10]:
# Same computation, but the `with` block closes the session automatically and
# lets us use the .run() / .eval() shorthands instead of sess.run(...).
with tf.Session() as sess:
    for variable in (x, y):
        variable.initializer.run()
    result = f.eval()

result

42

In [12]:
# Build a single op from tf.global_variables_initializer() and run it once,
# instead of running x.initializer and y.initializer separately as above.
init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    result = f.eval()
    
result

42

In [16]:
# With an InteractiveSession, init.run() and f.eval() are called here without
# an enclosing `with` block. The session has to be closed manually (next cell).
sess = tf.InteractiveSession()
init.run()
result = f.eval()
result

42

In [17]:
# Close the InteractiveSession opened in the previous cell.
sess.close()

In [18]:
# Create a variable and check whether it was added to the default graph
# (the recorded output is True).
x1 = tf.Variable(1)
x1.graph is tf.get_default_graph()

True

In [19]:
# Create a second, independent graph and make it the default only inside the
# `with` block; x2 is created while that block is active.
graph = tf.Graph()
with graph.as_default():
    x2 = tf.Variable(2)

# Compare x2's graph against the new graph and against the current default graph.
in_new_graph = x2.graph is graph
in_default_graph = x2.graph is tf.get_default_graph()
in_new_graph, in_default_graph

(True, False)

In [20]:
# Reset the default graph before building the next example.
tf.reset_default_graph()

In [25]:
# A small dependency chain: y and z are both built on x, which is built on w.
w = tf.constant(3)  # constants don't need initialization
x = w + 2
y = x + 5
z = x + 3

In [26]:
# Evaluate y and then x, each with its own eval() call.
with tf.Session() as sess:
    for node in (y, x):
        print(node.eval())

10
5


In [28]:
# Fetch y and z together with a single sess.run() call.
with tf.Session() as sess:
    y_val, z_val = sess.run([y, z])
    print(y_val, z_val, sep="\n")

10
8


# Linear regression via TensorFlow

In [72]:
import numpy as np
from sklearn.datasets import fetch_california_housing

In [81]:
# Load the California housing dataset; later cells read housing.data and housing.target.
housing = fetch_california_housing()

In [82]:
# Following the book's notation: m is the number of observations (rows)
# and n is the number of features (columns).
m, n = housing.data.shape
(m, n)

(20640, 8)

In [88]:
# Standardize the feature columns with scikit-learn's StandardScaler.
scaler = StandardScaler()
scaled_housing_data = scaler.fit_transform(housing.data)
scaled_housing_data.shape

(20640, 8)

In [89]:
# add a column of ones for intercept
scaled_housing_data_plus_bias = np.c_[np.ones((m, 1)), scaled_housing_data]
scaled_housing_data_plus_bias.shape

(20640, 9)

In [90]:
# Compare the target's shape before and after reshape(-1, 1)
# (the recorded output shows a 1-D array becoming a single-column 2-D array).
housing.target.shape, housing.target.reshape(-1, 1).shape

((20640,), (20640, 1))

In [91]:
# Wrap the full data set in float32 graph constants.
X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
# The target is a 1-D array; reshape it into a column vector first.
target_column = housing.target.reshape(-1, 1)
y = tf.constant(target_column, dtype=tf.float32, name='y')

In [92]:
# use Normal equation to solve analytically for values of theta
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

In [93]:
# Run the graph to obtain the numeric value of theta.
with tf.Session() as sess:
    theta_value = sess.run(theta)

In [94]:
# Display the coefficients found by the Normal Equation (first entry is the intercept,
# since the column of ones was placed first).
theta_value

array([[ 2.06856298],
       [ 0.82961965],
       [ 0.11875178],
       [-0.26552707],
       [ 0.30569667],
       [-0.00450281],
       [-0.03932635],
       [-0.8998825 ],
       [-0.87053877]], dtype=float32)

We could compute theta just as easily with plain NumPy matrix operations; the example above simply shows how to express the same calculation in TensorFlow. One advantage of the TensorFlow version is that, if a GPU were available, TensorFlow would run the calculations on the GPU instead of on the CPU.

# Gradient descent manually with TensorFlow

Or, instead of solving analytically, we can use gradient descent from TensorFlow to find the values of theta.

In [95]:
# Hyperparameters shared by all gradient-descent cells below:
# number of passes over the data, and the step size of each update.
n_epochs, learning_rate = 1000, 0.01

In [96]:
# Inputs: the whole data set as graph constants.
X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# Parameters: one coefficient per feature plus the intercept, drawn uniformly from [-1, 1).
initial_theta = tf.random_uniform([n+1, 1], -1.0, 1.0)
theta = tf.Variable(initial_theta, name='theta')

# Model and loss.
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')

# Hand-derived gradient of the MSE with respect to theta: (2/m) * X^T (X theta - y).
X_transposed = tf.transpose(X)
gradients = 2/m * tf.matmul(X_transposed, error)

# One batch gradient-descent step: theta <- theta - learning_rate * gradient.
theta_step = theta - learning_rate * gradients
training_op = tf.assign(theta, theta_step)

init = tf.global_variables_initializer()

In [97]:
# Run n_epochs full-batch updates, reporting the MSE every 100 epochs.
with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if not epoch % 100:
            print("Epoch", epoch, "MSE = ", sess.run(mse))
        sess.run(training_op)

    best_theta = sess.run(theta)

best_theta

Epoch 0 MSE =  8.19688
Epoch 100 MSE =  0.763312
Epoch 200 MSE =  0.594261
Epoch 300 MSE =  0.571749
Epoch 400 MSE =  0.558684
Epoch 500 MSE =  0.549367
Epoch 600 MSE =  0.542613
Epoch 700 MSE =  0.537703
Epoch 800 MSE =  0.53413
Epoch 900 MSE =  0.531527


array([[  2.06855226e+00],
       [  7.57593811e-01],
       [  1.31794810e-01],
       [ -7.97908083e-02],
       [  1.30270973e-01],
       [  1.17609208e-03],
       [ -3.88937108e-02],
       [ -8.72431338e-01],
       [ -8.32301676e-01]], dtype=float32)

# Gradient descent with autodiff

Same as above, with just a single line changed: we no longer need to derive the gradient analytically first. Instead, TensorFlow computes it for us via automatic differentiation (autodiff).

In [98]:
# Inputs: the whole data set as graph constants.
X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# Parameters: one coefficient per feature plus the intercept, drawn uniformly from [-1, 1).
initial_theta = tf.random_uniform([n+1, 1], -1.0, 1.0)
theta = tf.Variable(initial_theta, name='theta')

# Model and loss.
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')

# The only change from the previous section: instead of the hand-written
# formula 2/m * X^T error, ask TensorFlow to differentiate mse w.r.t. theta.
gradients = tf.gradients(mse, [theta])[0]

# One batch gradient-descent step: theta <- theta - learning_rate * gradient.
theta_step = theta - learning_rate * gradients
training_op = tf.assign(theta, theta_step)

init = tf.global_variables_initializer()

In [99]:
# Run n_epochs full-batch updates, reporting the MSE every 100 epochs.
with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if not epoch % 100:
            print("Epoch", epoch, "MSE = ", sess.run(mse))
        sess.run(training_op)

    best_theta = sess.run(theta)

best_theta

Epoch 0 MSE =  8.6127
Epoch 100 MSE =  0.692715
Epoch 200 MSE =  0.586776
Epoch 300 MSE =  0.570046
Epoch 400 MSE =  0.558734
Epoch 500 MSE =  0.550354
Epoch 600 MSE =  0.544109
Epoch 700 MSE =  0.539435
Epoch 800 MSE =  0.535924
Epoch 900 MSE =  0.533275


array([[ 2.06855249],
       [ 0.86377585],
       [ 0.14774553],
       [-0.28780937],
       [ 0.30554113],
       [ 0.00560137],
       [-0.04250726],
       [-0.65645677],
       [-0.62904358]], dtype=float32)

# Using an optimizer instead of manually implementing gradient descent

In [103]:
# Inputs: the whole data set as graph constants.
X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

# Parameters: one coefficient per feature plus the intercept, drawn uniformly from [-1, 1).
theta = tf.Variable(tf.random_uniform([n+1, 1], -1.0, 1.0), name='theta')

# Model and loss.
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')

# The change from the previous section: the explicit tf.gradients(...) call and
# the hand-written tf.assign(theta, theta - learning_rate * gradients) update
# are both replaced by an optimizer. minimize() derives the gradients of mse
# itself, so no separate `gradients` tensor is built here.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [104]:
# Run n_epochs optimizer steps on the full data set, reporting the MSE every 100 epochs.
with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if not epoch % 100:
            print("Epoch", epoch, "MSE = ", sess.run(mse))
        sess.run(training_op)

    best_theta = sess.run(theta)

best_theta

Epoch 0 MSE =  8.296
Epoch 100 MSE =  0.681659
Epoch 200 MSE =  0.548918
Epoch 300 MSE =  0.542588
Epoch 400 MSE =  0.539258
Epoch 500 MSE =  0.536589
Epoch 600 MSE =  0.534417
Epoch 700 MSE =  0.532641
Epoch 800 MSE =  0.531187
Epoch 900 MSE =  0.529995


array([[  2.06855226e+00],
       [  8.96659851e-01],
       [  1.37024999e-01],
       [ -3.81520927e-01],
       [  3.96921396e-01],
       [  1.27420318e-03],
       [ -4.23014536e-02],
       [ -7.03722835e-01],
       [ -6.81664765e-01]], dtype=float32)

# Implement mini-batch gradient descent, also output for TensorBoard

In [114]:
# Start from an empty default graph. This cell writes the default graph to
# TensorBoard, and without a reset that graph would still contain every node
# created by the earlier sections (the previous X, y, theta, mse, ...).
tf.reset_default_graph()

# X and y are now placeholders rather than constants holding the full data
# set, so a different mini-batch can be fed in at every training step.
# The leading dimension is None so any batch size is accepted.
X = tf.placeholder(tf.float32, shape=(None, n+1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name='y')
batch_size = 100
n_batches = int(np.ceil(m / batch_size))  # batches per epoch, rounding up

# Parameters: one coefficient per feature plus the intercept, drawn uniformly from [-1, 1).
theta = tf.Variable(tf.random_uniform([n+1, 1], -1.0, 1.0), name='theta')

# Model and loss.
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')

# minimize() derives the gradients of mse itself, so no explicit
# tf.gradients(...) tensor is needed.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

# Output for TensorBoard: every run logs into its own timestamped
# sub-directory of tf_logs.
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

# Scalar summary of the loss, plus a writer that also records the graph definition.
mse_summary = tf.summary.scalar('MSE', mse)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [115]:
def fetch_batch(epoch, batch_index, batch_size):
    """Draw one random mini-batch from the scaled housing data.

    Row indices are sampled independently (so a row may appear more than once
    in a batch) using a generator seeded from the (epoch, batch_index) pair,
    which makes every batch reproducible.

    Args:
        epoch: Index of the current epoch.
        batch_index: Index of the batch within the epoch.
        batch_size: Number of rows to draw.

    Returns:
        A tuple (X_batch, y_batch): the selected rows of
        scaled_housing_data_plus_bias and the matching targets as a column vector.
    """
    # Use a private generator instead of reseeding the global np.random state,
    # so calling this function does not silently change random numbers drawn
    # elsewhere in the notebook. Seeding RandomState with the same value is
    # expected to yield the same indices as np.random.seed(...) + np.random.randint(...).
    rng = np.random.RandomState(epoch * n_batches + batch_index)
    indices = rng.randint(m, size=batch_size)
    X_batch = scaled_housing_data_plus_bias[indices]
    y_batch = housing.target.reshape(-1, 1)[indices]
    return X_batch, y_batch

# Mini-batch gradient descent: one optimizer step per batch, logging the batch
# MSE to TensorBoard every 10th batch.
try:
    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(n_epochs):
            for batch_index in range(n_batches):
                X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
                batch_feed = {X: X_batch, y: y_batch}

                if batch_index % 10 == 0:
                    summary_str = mse_summary.eval(feed_dict=batch_feed)
                    # Global step counter so points from different epochs do not overlap.
                    step = epoch * n_batches + batch_index
                    file_writer.add_summary(summary_str, step)

                # The training step must sit inside the batch loop; otherwise only
                # the last batch of each epoch would ever be used for an update.
                sess.run(training_op, feed_dict=batch_feed)

        best_theta = theta.eval()
finally:
    # Flush and close the event file even if training is interrupted.
    file_writer.close()

# Keep this as the last expression so the notebook displays the result.
best_theta