# Section 3.1 
Covers
* Idea of Linear Regression
* Vectorization for Speed
* Normal Distribution and Squared Loss
* Idea of Neural Networks and Biological Inspiration

# Section 3.2 
* Implementation of Linear Regression from Scratch
* Generate a small synthetic dataset and run regression to recover the known parameters

In [3]:
import torch
import random

In [26]:
def synthetic_data(w, b, num_examples):
    """Generate a noisy linear dataset following y = Xw + b + noise.

    Args:
        w: Weight tensor; its first dimension sets the number of features.
        b: Bias added to every label.
        num_examples: Number of examples (rows) to sample.

    Returns:
        A ``(features, labels)`` pair. Features have shape
        ``(num_examples, w.shape[0])``; labels are reshaped into a single
        column.
    """
    num_features = w.shape[0]
    # Features are sampled from a standard normal distribution.
    X = torch.normal(0, 1, (num_examples, num_features))
    clean = torch.matmul(X, w) + b
    # Perturb the exact labels with Gaussian noise (standard deviation 0.01).
    noisy = clean + torch.normal(0, 0.01, clean.shape)
    return X, noisy.reshape((-1, 1))

# Ground-truth parameters that the regression below should recover.
true_w = torch.tensor([-2.0, 3.4])
true_b = 11
# 1000 synthetic examples drawn from the noisy linear model.
features, labels = synthetic_data(true_w, true_b, num_examples=1000)

<font color='red'> torch.mm or torch.matmul difference? one supports broadcasting </font>

* **torch.mm** is plain matrix multiplication. Both arguments must be 2-D tensors. No frills, no broadcasting.
* **torch.matmul** is the more general version. Each argument can be 1-D, 2-D, or higher-dimensional; when an argument has more than two dimensions, the leading (batch) dimensions are broadcast.


## NOTE
First, we want to read the data in reasonably sized minibatches so that we can take advantage of the GPU hardware.
Second, we don't want the minibatches to be too large: each one is only used to estimate the gradient, so a modest sample is enough and larger batches just make every update more expensive.

In [29]:
# Minibatch generator: one shuffled pass over the dataset
def data_iter(batch_size, features, labels):
    """Yield ``(features, labels)`` minibatches in random order.

    Args:
        batch_size: Maximum number of examples per minibatch.
        features: Indexable tensor of examples; ``len(features)`` gives the
            number of examples.
        labels: Tensor indexed with the same positions as ``features``.

    Yields:
        Pairs ``(features[idx], labels[idx])``; the final batch may hold
        fewer than ``batch_size`` examples.
    """
    total = len(features)
    order = list(range(total))
    # Visit the examples in a random order.
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the list, so the last batch is
        # simply shorter when `total` is not a multiple of `batch_size`.
        picked = torch.tensor(order[start:start + batch_size])
        yield features[picked], labels[picked]

In [31]:
batch_size = 5
# Peek at the first minibatch only, to sanity-check shapes and values.
first_pass = data_iter(batch_size, features, labels)
for X, y in first_pass:
    print(X, '\n', y)
    break

tensor([[ 0.2329,  0.6514],
        [ 0.8974, -0.4637],
        [ 0.6997,  0.0716],
        [ 0.6129,  0.1667],
        [-0.1680,  0.9509]]) 
 tensor([[12.7633],
        [ 7.6367],
        [ 9.8422],
        [10.3399],
        [14.5695]])


In [35]:
# Model parameters: small random weights and a zero bias, both tracked by autograd
w = torch.normal(mean=0.0, std=0.01, size=(2, 1), requires_grad=True)
b = torch.zeros((1,), requires_grad=True)

In [36]:
def linreg(X, w, b):  #@save
    """Linear regression model: return the prediction ``Xw + b``.

    Args:
        X: Input features.
        w: Weights, multiplied with ``X`` by matrix multiplication.
        b: Bias, added to the product (broadcast as needed).
    """
    prediction = X @ w
    return prediction + b

In [37]:
def squared_loss(y_hat, y):  #@save
    """Element-wise squared loss ``(y_hat - y)^2 / 2``.

    Args:
        y_hat: Predictions.
        y: Targets; reshaped to the shape of ``y_hat`` before subtracting.

    Returns:
        A tensor with the shape of ``y_hat`` holding the per-example loss
        (no reduction is applied).
    """
    target = y.reshape(y_hat.shape)
    residual = y_hat - target
    return residual ** 2 / 2

In [38]:
def sgd(params, lr, batch_size):  #@save
    """Minibatch stochastic gradient descent.

    Updates every parameter in place by ``lr * grad / batch_size`` and then
    resets its gradient to zero so the next backward pass starts clean.

    Args:
        params: Iterable of tensors that require gradients.
        lr: Learning rate.
        batch_size: Number of examples the accumulated gradient was summed
            over; the gradient is divided by it to obtain an average.
    """
    # Perform the update under `no_grad` rather than through the legacy
    # `.data` attribute: the in-place step is still excluded from the graph,
    # but autograd keeps tracking in-place modifications of the tensor.
    with torch.no_grad():
        for param in params:
            # A parameter that did not take part in the last backward pass
            # has no gradient yet; leave it untouched.
            if param.grad is None:
                continue
            param -= lr * param.grad / batch_size
            param.grad.zero_()

In [39]:
# Hyperparameters and the model/loss functions used for training.
lr, num_epochs = 0.03, 3
net, loss = linreg, squared_loss

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        # Forward pass: per-example loss on the current minibatch.
        y_hat = net(X, w, b)
        batch_loss = loss(y_hat, y)
        # Sum to a scalar so `backward` can populate the gradients of
        # `w` and `b`.
        batch_loss.sum().backward()
        # Gradient step; `sgd` also resets the gradients.
        sgd([w, b], lr, batch_size)
    # Report the loss over the whole dataset without recording a graph.
    with torch.no_grad():
        train_l = loss(net(features, w, b), labels)
        print(f'epoch {epoch + 1}, loss {float(train_l.mean()):f}')

epoch 1, loss 0.000483
epoch 2, loss 0.000052
epoch 3, loss 0.000052


In [40]:
# Compare the learned parameters with the ground truth used to generate the data.
# `w` is reshaped to the shape of `true_w` so the subtraction is element-wise.
print(f'error in estimating w: {true_w - w.reshape(true_w.shape)}')
print(f'error in estimating b: {true_b - b}')

error in estimating w: tensor([-0.0001, -0.0001], grad_fn=<SubBackward0>)
error in estimating b: tensor([-0.0005], grad_fn=<RsubBackward1>)
