In [1]:
import math
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l

### Generating the Dataset


$$y = 5 + 1.2x - 3.4\frac{x^2}{2!} + 5.6 \frac{x^3}{3!} + \epsilon \text{ where }
\epsilon \sim \mathcal{N}(0, 0.1^2).$$

The noise term $\epsilon$ obeys a normal distribution
with a mean of 0 and a standard deviation of 0.1.
For optimization, we typically want to avoid
very large values of gradients or losses.
This is why the *features*
are rescaled from $x^i$ to $\frac{x^i}{i!}$.
It allows us to avoid very large values for large exponents $i$.
We will synthesize 100 samples each for the training set and test set.


In [10]:
# Synthesize the polynomial-regression dataset:
#   y = 5 + 1.2 x - 3.4 x^2/2! + 5.6 x^3/3! + noise,  noise ~ N(0, 0.1^2)
max_degree = 20  # number of polynomial feature columns (degrees 0..19)
n_train, n_test = 100, 100  # training / test set sizes

# 1-D coefficient vector of shape (max_degree,), NOT a (1, 20) matrix.
# Only the constant, linear, quadratic and cubic coefficients are non-zero.
true_w = np.zeros(max_degree)
true_w[0:4] = np.array([5, 1.2, -3.4, 5.6])

# One standard-normal scalar x per sample, stored as a (200, 1) column.
features = np.random.normal(size=(n_train + n_test, 1))
# Shuffling i.i.d. draws does not change their distribution; the call is kept
# so the sequence of RNG calls is the same as before.
np.random.shuffle(features)

# np.power(a, b) = a**b element by element. Broadcasting the (200, 1) column
# against the (1, max_degree) row of exponents yields a (200, max_degree)
# matrix whose column i holds x**i.
degrees = np.arange(max_degree).reshape(1, -1)
poly_features = np.power(features, degrees)

# Rescale column i from x**i to x**i / i! with a single broadcast division
# (math.gamma(i + 1) == i!), which keeps high-degree columns from blowing up.
factorials = np.array([math.gamma(i + 1) for i in range(max_degree)])
poly_features /= factorials

# (200, max_degree) @ (max_degree,) -> labels of shape (200,).
labels = np.dot(poly_features, true_w)
# `scale` is the STANDARD DEVIATION of the noise (0.1, i.e. variance 0.01);
# `loc` (left at its default of 0) is the mean.
labels += np.random.normal(scale=0.1, size=labels.shape)

[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]]


In [25]:
# Sanity check of elementwise exponentiation with broadcasting:
# a (6, 1) column of bases raised to a length-2 vector of exponents gives a
# (6, 2) result, where column 0 holds the squares and column 1 the cubes.
A = np.array([1.0, 2.0, 3.0, 5.0, 8.0, 9.0]).reshape(-1, 1)
B = np.array([2.0, 3.0])
C = A ** B  # the ** operator on arrays is np.power(A, B)
print(A, C, sep="\n")

[[1.]
 [2.]
 [3.]
 [5.]
 [8.]
 [9.]]
[[  1.   1.]
 [  4.   8.]
 [  9.  27.]
 [ 25. 125.]
 [ 64. 512.]
 [ 81. 729.]]
