In [1]:
import tensorflow as tf
import numpy as np
import math
import matplotlib.pyplot as plt

$$ y = 5 + 1.2x - 3.4x^2 + 5.6x^3 + \epsilon, \qquad \epsilon \sim \mathcal{N}(0,\ 0.1^2) $$

In [2]:

# Synthetic polynomial-regression data.
#   features      : random inputs x, shape (n_train + n_test, 1)
#   poly_features : the powers x**0 .. x**(max_degrees - 1) of every input
#   true_w        : ground-truth coefficients (only the first four are non-zero)
#   labels        : y = poly_features @ true_w + Gaussian noise

SEED = 0
np.random.seed(SEED)  # fixed seed so Restart & Run All regenerates the same data

max_degrees = 20
n_train, n_test = 100, 100
true_w = np.zeros(max_degrees)  # coefficients of degree >= 4 stay at zero
true_w[:4] = [5, 1.2, -3.4, 5.6]

# The draws are i.i.d., so they need no extra shuffling before the train/test split.
features = np.random.normal(size=(n_train + n_test, 1))

# Broadcasting (N, 1) against (1, max_degrees): column i holds x**i.
# NOTE(review): the columns are raw monomials up to x**19, so their magnitudes
# differ by many orders; the otherwise unused `math` import suggests a
# `math.gamma(i + 1)` rescaling may have been intended -- confirm before relying
# on the training losses below.
poly_features = np.power(features, np.arange(max_degrees).reshape(1, -1))

labels = np.dot(poly_features, true_w)
labels += np.random.normal(scale=0.1, size=labels.shape)  # observation noise


In [3]:
# Re-bind each NumPy array to a float32 TensorFlow constant under the same name.
true_w = tf.constant(true_w, dtype=tf.float32)
features = tf.constant(features, dtype=tf.float32)
poly_features = tf.constant(poly_features, dtype=tf.float32)
labels = tf.constant(labels, dtype=tf.float32)

### First overfit the model, then check the effect of regularization

In [4]:
def load_data(data, batch_size, is_train=False, buffer_size=100):
    """Wrap in-memory tensors in a batched `tf.data.Dataset`.

    Args:
        data: Tensor or tuple of tensors, sliced along the first axis.
        batch_size: Number of examples per batch.
        is_train: When True the examples are shuffled before batching;
            when False the original order is kept.
        buffer_size: Size of the shuffle buffer used when `is_train` is True.

    Returns:
        A `tf.data.Dataset` yielding batches of `data`.
    """
    dataset = tf.data.Dataset.from_tensor_slices(data)
    if is_train:
        # Only training data is shuffled; evaluation data keeps its order.
        dataset = dataset.shuffle(buffer_size=buffer_size)
    return dataset.batch(batch_size)

In [5]:
# Baseline run: a single Dense unit fitted on all polynomial features with no
# weight penalty. Hyperparameters come first so the optimizer really uses `lr`.
num_epochs, batch_size, lr = 50, 10, 0.05

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.MeanSquaredError()

net = tf.keras.Sequential([
    tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.1))
])

train_features, train_labels = poly_features[:n_train, :], labels[:n_train]
data_iter = load_data((train_features, train_labels), batch_size, is_train=True)

training_loss = []  # one Python float per epoch
for epoch in range(num_epochs):
    for X, y in data_iter:
        with tf.GradientTape() as tape:
            yhat = net(X)
            # Keras losses are called as loss(y_true, y_pred).
            mse_loss = loss(y, yhat)

        grads = tape.gradient(mse_loss, net.trainable_variables)
        optimizer.apply_gradients(zip(grads, net.trainable_variables))

    # Loss over the whole training split after this epoch's updates.
    epoch_loss = loss(train_labels, net(train_features))
    training_loss.append(float(epoch_loss))
    print(f"epoch : {epoch}, training_loss : {epoch_loss}")

epoch : 0, training_loss : 42848067584.0
epoch : 1, training_loss : 134169427968.0
epoch : 2, training_loss : 266419142656.0
epoch : 3, training_loss : 18056912896.0
epoch : 4, training_loss : 115343327232.0
epoch : 5, training_loss : 38512803840.0
epoch : 6, training_loss : 8449315328.0
epoch : 7, training_loss : 7957939712.0
epoch : 8, training_loss : 9280643072.0
epoch : 9, training_loss : 5912075776.0
epoch : 10, training_loss : 4672243712.0
epoch : 11, training_loss : 3987857664.0
epoch : 12, training_loss : 3440613120.0
epoch : 13, training_loss : 2930254080.0
epoch : 14, training_loss : 2612780800.0
epoch : 15, training_loss : 2198679296.0
epoch : 16, training_loss : 2302754560.0
epoch : 17, training_loss : 2211642112.0
epoch : 18, training_loss : 1501234304.0
epoch : 19, training_loss : 1186942720.0
epoch : 20, training_loss : 657969408.0
epoch : 21, training_loss : 16661351424.0
epoch : 22, training_loss : 46075080704.0
epoch : 23, training_loss : 7378688000.0
epoch : 24, trai

In [6]:
# L2 norm of the first weight array of `net` (the Dense kernel) after the run
# without a penalty term; kept for comparison with the regularized run.
norm_zero_lambda = tf.norm(net.get_weights()[0]).numpy()

### Regularize the above model

In [7]:
# Regularized run: same model and data as the baseline, plus an L2 penalty on
# the kernel. Hyperparameters come first so the optimizer really uses `lr`.
num_epochs, batch_size, lr, lambd = 50, 10, 0.05, 3

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.MeanSquaredError()

net = tf.keras.Sequential([
    tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.1))
])


def l2_penalty(w):
    """Return half the sum of squared entries of `w`."""
    return tf.reduce_sum(tf.pow(w, 2)) / 2


train_features, train_labels = poly_features[:n_train, :], labels[:n_train]
data_iter = load_data((train_features, train_labels), batch_size, is_train=True)

training_loss = []  # one Python float per epoch (penalty term excluded)
for epoch in range(num_epochs):
    for X, y in data_iter:
        with tf.GradientTape() as tape:
            yhat = net(X)
            # Keras losses are called as loss(y_true, y_pred); the penalty is
            # applied to the first trainable variable (the Dense kernel) only.
            mse_loss = loss(y, yhat) + lambd * l2_penalty(net.trainable_variables[0])

        grads = tape.gradient(mse_loss, net.trainable_variables)
        optimizer.apply_gradients(zip(grads, net.trainable_variables))

    # Plain MSE over the whole training split after this epoch's updates.
    epoch_loss = loss(train_labels, net(train_features))
    training_loss.append(float(epoch_loss))
    print(f"epoch : {epoch}, training_loss : {epoch_loss}")

epoch : 0, training_loss : 2107365130240.0
epoch : 1, training_loss : 9230185472.0
epoch : 2, training_loss : 1306311131136.0
epoch : 3, training_loss : 2281608704.0
epoch : 4, training_loss : 131580116992.0
epoch : 5, training_loss : 77246734336.0
epoch : 6, training_loss : 79104098304.0
epoch : 7, training_loss : 247983712.0
epoch : 8, training_loss : 981275840.0
epoch : 9, training_loss : 334350656.0
epoch : 10, training_loss : 122329152.0
epoch : 11, training_loss : 125269320.0
epoch : 12, training_loss : 201294112.0
epoch : 13, training_loss : 137106544.0
epoch : 14, training_loss : 76828536.0
epoch : 15, training_loss : 627702528.0
epoch : 16, training_loss : 389540096.0
epoch : 17, training_loss : 210110688.0
epoch : 18, training_loss : 2044560512.0
epoch : 19, training_loss : 4505469440.0
epoch : 20, training_loss : 7682944512.0
epoch : 21, training_loss : 16359603200.0
epoch : 22, training_loss : 34123872256.0
epoch : 23, training_loss : 39241134080.0
epoch : 24, training_loss

In [8]:
# L2 norm of the first weight array of `net` (the Dense kernel) after the run
# trained with the L2 penalty.
norm_with_lambda = tf.norm(net.get_weights()[0]).numpy()

In [9]:
# Output order: norm from the penalized run first, then the unpenalized baseline.
print(norm_with_lambda, norm_zero_lambda)

0.44223258 1.2648956


## Alternate Implementation

In [11]:
%matplotlib inline
from d2l import tensorflow as d2l
import tensorflow as tf

In [19]:
# Many inputs, few training examples. These names re-bind n_train, n_test,
# batch_size and true_w from the earlier cells.
n_train, n_test = 20, 100
num_inputs, batch_size = 200, 5

# Ground truth: every weight is 0.01 and the bias is 0.05.
true_w = tf.ones((num_inputs, 1)) * 0.01
true_b = 0.05

# Train split first, then test split; the test iterator is built with is_train=False.
train_data = d2l.synthetic_data(true_w, true_b, n_train)
train_iter = d2l.load_array(train_data, batch_size)
test_data = d2l.synthetic_data(true_w, true_b, n_test)
test_iter = d2l.load_array(test_data, batch_size, is_train=False)

In [20]:
def init_params():
    """Create fresh trainable parameters for the linear model.

    Returns:
        `[w, b]`: `w` is a `(num_inputs, 1)` variable drawn from a normal
        distribution with mean 1, `b` is a length-1 variable of zeros.
        `num_inputs` is read from the notebook's global scope.
    """
    weight_init = tf.random.normal(mean=1, shape=(num_inputs, 1))
    w = tf.Variable(weight_init)
    bias_init = tf.zeros(shape=(1, ))
    b = tf.Variable(bias_init)
    return [w, b]

In [21]:
def l2_penalty(w):
    """Return half the sum of the squared entries of `w`."""
    squared = tf.pow(w, 2)
    return tf.reduce_sum(squared) / 2

In [30]:
def train(lambd):
    """Fit the linear model with minibatch SGD and an L2 penalty weighted by `lambd`.

    Reads `train_iter` and `batch_size` from the notebook's global scope,
    starts from freshly initialised parameters, and prints the L2 norm of the
    learned weight vector. Returns nothing.
    """
    w, b = init_params()
    num_epochs = 100
    lr = 0.003

    def net(X):
        return d2l.linreg(X, w, b)

    params = [w, b]
    for _ in range(num_epochs):
        for X, y in train_iter:
            with tf.GradientTape() as tape:
                # Data-fit term plus the weighted penalty on w (b is not penalised).
                data_term = d2l.squared_loss(net(X), y)
                l = data_term + lambd * l2_penalty(w)
            grads = tape.gradient(l, params)
            d2l.sgd(params, grads, lr, batch_size)
    print('L2 norm of w:', tf.norm(w).numpy())

In [31]:
# Train with the L2 penalty weighted by 3.
train(lambd=3)

L2 norm of w: 0.572348


In [32]:
# lambd=0 multiplies the penalty term by zero, i.e. no regularization.
train(lambd=0)

L2 norm of w: 19.757078
