<a href="https://colab.research.google.com/github/mgfrantz/CodingNomads-Intro-To-Deep-Learning/blob/master/Fundamentals/Our%20first%20neural%20network%20-%20linear%20regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports
import torch
from torch import nn, optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
def seed_all(seed=42):
    """
    Seed the numpy and torch random number generators.

    Args:
        seed: Integer seed applied to both libraries (default 42).
    """
    np.random.seed(seed)
    # torch.manual_seed() actually seeds torch's generator; assigning to
    # `torch.random.seed` would only replace that function with an int.
    torch.manual_seed(seed)

seed_all()

# Single-Variable Linear Regression

In [None]:
# Create some X data
X = np.random.uniform(0, 10, 100)

In [None]:
# Define the slope (m), bias (b), and some noise we want to add to X to make y
m = 3
b = 1.8
noise = np.random.normal(scale=3, size=100) # add this so we don't have a perfect line relationship

### Exercise 2.1: Create your `y` variable

In the cell above, we've created the slope `m`, the intercept `b`, and a noise factor `noise`. 
In the cell below, use multiplication and addition to create your `y` variable.

In [None]:
# Your code here
y = ...

In [None]:
fig, ax = plt.subplots(figsize=(10,10))
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.scatter(X, y)

### Exercise 2.2: Create a function that returns MSE

Look at some of the functions in `torch`.
Based on what you've learned in the lesson, create a function that returns the mean squared error for a `torch.Tensor` of `predictions` and `actuals`.
There is a test case below so you can understand what the inputs and outputs should be.

In [None]:
# Define MSE
def mse(predictions:torch.Tensor, actuals:torch.Tensor) -> torch.Tensor:
    """
    Exercise 2.2 stub: return the mean squared error of `predictions` vs `actuals`.

    The test cell below expects the result to be a `torch.Tensor` (about 0.01
    for its sample inputs). Until implemented, calling this raises
    NotImplementedError.
    """
    # your code here
    raise NotImplementedError("Implement MSE, then remove this line")

In [None]:
ys = torch.tensor([1,2,3])
yhats = torch.tensor([1.1, 2.1, 3.1])
assert isinstance(mse(ys, yhats), torch.Tensor), "The output of mse should be a torch.Tensor!"
assert torch.allclose(mse(ys, yhats), torch.tensor(0.01)), "The MSE should be about 0.01"

In [None]:
mse(ys, yhats)

In [None]:
# Now that we've defined MSE, let's just use Torch's.
mse_loss = nn.MSELoss()
mse_loss(ys, yhats)

In [None]:
# We can also use the functional API to calculate MSE
F.mse_loss(ys, yhats)

### Exercise 2.3: Based on your knowledge of `scikit-learn`, perform a linear regression to predict `y` from `X`.

Fill in the code to fit a linear regression model and find the coefficient(s) and intercept/bias.
Are your slope and intercept in the ballpark of the slope and intercept we defined earlier in the notebook?

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Fit a linear model on our data
lr = ... # Instantiate and fit a vanilla linear regression

In [None]:
# Display the slope and intercept
slope = ... # Your code here
intercept = ... # Your code here

assert isinstance(slope, np.ndarray)
assert isinstance(intercept, float)

print(f'The slope is {str(slope)} and the intercept is {intercept}')

In [None]:
assert isinstance(intercept, float)

In [None]:
# Calculate the mean squared error
predictions = ... # Your code here
lr_mse = mean_squared_error(y, predictions)

print(f'The mean squared error from our linear regression is {lr_mse}')

In [None]:
# Plot our line of best fit
fig, ax = plt.subplots(figsize=(10,10))
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.scatter(X, y)
_x = np.arange(0, 10)
_y = _x * lr.coef_[0] + lr.intercept_
ax.plot(_x, _y, c='red', label=f"Line of best fit")
ax.legend()

In [None]:
# Because we're in torch now, let's just turn X and y into tensors.
X = torch.tensor(X)
y = torch.tensor(y)

## Exercise 2.4: Complete the `forward` method

Based on your knowledge of single-variable linear models and object-oriented programming, complete the `forward` method in the `LinReg` class.
This method should take the input X, multiply it by the `slope` class attribute, and add the `bias` class attribute.

In [None]:
# Build our linear regression model
class LinReg(nn.Module):
    """
    Single-variable linear regression model with a learnable slope and bias.
    """
    def __init__(self):
        super().__init__()
        # Randomly initialize 2 parameters, one for our slope and one for our bias.
        self.slope = nn.Parameter(torch.rand(1))
        self.bias = nn.Parameter(torch.rand(1))

    def forward(self, X):
        """
        Exercise 2.4 placeholder.

        Should return `X` multiplied by `self.slope` plus `self.bias`;
        as written it returns `...` (Ellipsis) until the exercise is completed.
        """
        return ...

In [None]:
lr = LinReg()

## Exercise 2.5: Modifying the learning rate and number of epochs

Change the number of epochs `N_EPOCHS` and learning rate `LR` variables.
What do you observe about the loss over time, and what happens when the learning rate becomes too large or too small?

In [None]:
N_EPOCHS = 300
LR = 1e-3

In [None]:
slopes = []
biases = []
losses = []
_alphas = []
# Print progress about ten times. Integer arithmetic keeps the modulo exact for
# any N_EPOCHS, and max() avoids a zero interval when N_EPOCHS < 10.
log_every = max(1, N_EPOCHS // 10)
for i in range(N_EPOCHS):
    # Make some inferences
    yhat = lr(X)
    # Measure how bad those guesses were
    loss = F.mse_loss(yhat, y)
    if i % log_every == 0:
        print(f"Epoch {i} Train Loss: {loss:.04f}")
    # Calculate the gradient of all the parameters with respect to the loss
    loss.backward()
    # Apply the SGD update rule. no_grad() keeps the in-place parameter
    # update from being tracked by autograd.
    with torch.no_grad():
        lr.slope.sub_(lr.slope.grad * LR)
        lr.bias.sub_(lr.bias.grad * LR)
    # Zero out the gradients for the next round
    lr.slope.grad.zero_()
    lr.bias.grad.zero_()

    # Record the parameters and losses so we can plot them out later.
    # .item() turns a one-element tensor into a plain Python float.
    slopes.append(lr.slope.item())
    biases.append(lr.bias.item())
    losses.append(loss.item())
    # Fraction of training completed; reused as line transparency when plotting.
    _alphas.append(i/N_EPOCHS)

In [None]:
lr.slope, lr.bias

Let's plot what we've done so far.
The blue dots represent the data, and the red lines represent the functions created by our slopes and biases as the model learns.
The more transparent lines are the first iterations in our training loop.
Notice that as the lines get more solid, they fit the data better.
This illustrates the process of our model learning the data!

In [None]:
fig, ax = plt.subplots(figsize=(10,10))
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.scatter(X, y)
# The x grid is identical for every line, so build it once.
_x = np.arange(0, 10)
for epoch, (s, b, a) in enumerate(zip(slopes, biases, _alphas)):
    _y = _x * s + b
    # `a` is epoch / N_EPOCHS (always below 1): fine as a transparency, but
    # int(a) would label every line "Epoch 0", so label with the real index.
    ax.plot(_x, _y, alpha=a, c='red', label=f"Epoch {epoch}")


In [None]:
fig, ax = plt.subplots(figsize=(14,10))
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
# Training diverged if the loss finished above where it started or became NaN.
last_loss, first_loss = losses[-1], losses[0]
diverged = bool(last_loss > first_loss) or bool(np.isnan(last_loss))
ax.set_title('Diverging - BAD!' if diverged else 'Converging - goood!')

<!-- split -->

# Multi-variable linear regression

In [None]:
# Make yet another fake dataset
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=3, n_informative=2, bias=3, noise=2)

In [None]:
# No more bad habits, we need to split our data.
X_train, X_valid, y_train, y_valid = (torch.tensor(i).float() for i in train_test_split(X, y, test_size=0.1, random_state=42))

## Exercise 2.6: Create `weights` and `bias` tensors

In the cell below, create a `weights` tensor and a `bias` tensor.
For both of these tensors, use the [`torch.rand`](https://pytorch.org/docs/stable/generated/torch.rand.html) function.
The `weights` tensor should have as many values as `X_train` has features.
The `bias` tensor should just be a single random value.

Once you've created these tensors, turn them into parameters using the [`nn.Parameter`](https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html) class.
If you need a reference here, look at how this was done when we created the `LinReg` class in Exercise 2.4.

In [None]:
# Let's create some temporary weights and biases and test out our matrix operations before we build our model.
# Create a weights parameter with 1 beta per column in X
weights = ...
# Create our bias parameter
bias = ...

In [None]:
# Test out the operation we want to perform in the forward pass
torch.matmul(X_train[:10], weights) + bias

In [None]:
# FYI: @ does the same thing as matmul in this context and is easier
X_train[:10]@weights + bias

In [None]:
# Sanity check: different implementations of our forward pass are the same
assert (X_train@weights + bias == torch.matmul(X_train, weights) + bias).all()

In [None]:
# Let's make our model
class LinRegMulti(nn.Module):
    """
    Linear regression over several input features.

    Computes ``X @ weights + bias`` with one learnable weight per input
    column and a single learnable bias.

    Args:
        n_cols: Number of feature columns in the input.
    """

    def __init__(self, n_cols):
        super().__init__()
        self.n_cols = n_cols

        # Randomly initialize one weight per column and a single bias.
        self.weights = nn.Parameter(torch.rand(self.n_cols))
        self.bias = nn.Parameter(torch.rand(1))

    def forward(self, X):
        """
        Return the model's predictions for `X`.

        For a 2-D `X` of shape (n_samples, n_cols) the result is a 1-D
        tensor with one prediction per row.
        """
        # `weights` is 1-D, so transposing it is a no-op, and `.T` on
        # non-2-D tensors is deprecated in torch; multiply by it directly.
        return X @ self.weights + self.bias

In [None]:
N_EPOCHS = 10000
LR = 1e-3

In [None]:
lrm = LinRegMulti(X_train.shape[1])

In [None]:
# Instead of updating each parameter individually, let's make an update rule function.
def gd_update_rule(parameters, lr):
    """
    Apply one gradient-descent step to a parameter in place, then clear its gradient.

    Args:
        parameters: A tensor/`nn.Parameter` whose `.grad` was filled by `backward()`.
        lr: Learning rate that scales the gradient step.
    """
    # A parameter that took no part in the last backward pass has no gradient;
    # there is nothing to apply, so leave it untouched instead of crashing.
    if parameters.grad is None:
        return
    # no_grad() keeps the in-place update from being tracked by autograd.
    with torch.no_grad():
        parameters.sub_(parameters.grad * lr)
    parameters.grad.zero_()

In [None]:
train_losses = []
valid_losses = []

In [None]:
# Print progress about ten times. Integer arithmetic keeps the modulo exact for
# any N_EPOCHS, and max() avoids a zero interval when N_EPOCHS < 10.
log_every = max(1, N_EPOCHS // 10)
for i in range(N_EPOCHS):
    # Forward pass and loss on the training split
    yhat = lrm(X_train)
    loss = mse(yhat, y_train)
    # Backpropagate, then step every parameter with our update rule
    loss.backward()
    for p in lrm.parameters():
        gd_update_rule(p, LR)
    # Store plain Python floats so the history is easy to plot and inspect
    train_losses.append(loss.item())

    # Evaluate on the validation split without tracking gradients
    with torch.no_grad():
        yhat = lrm(X_valid)
        valid_loss = mse(yhat, y_valid)
        valid_losses.append(valid_loss.item())

    if i % log_every == 0:
        print(f"Epoch {i} Train Loss: {loss:.04f}, Valid Loss: {valid_loss:.04f}")

In [None]:
EPOCHS_TO_SHOW = 2000
fig, ax = plt.subplots(figsize=(12, 12))
ax.plot(train_losses[:EPOCHS_TO_SHOW], label='Train', linewidth=3, alpha=0.5)
ax.plot(valid_losses[:EPOCHS_TO_SHOW], ls='--', label='Valid')
ax.legend()

In [None]:
lrm.weights

In [None]:
lrm.bias

## The `Linear` layer


In [None]:
class Linear(nn.Module):
    """
    A hand-written fully connected layer computing ``X @ weights + bias``.

    Args:
        dim_in: Number of input features.
        dim_out: Number of output features.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        # Uniform random starting values: the weight matrix first, then the bias.
        initial_weights = torch.rand((dim_in, dim_out))
        initial_bias = torch.rand(dim_out)
        self.weights = nn.Parameter(initial_weights)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, X):
        """Project `X` through the weight matrix and add the bias."""
        projected = torch.matmul(X, self.weights)
        return projected + self.bias

In [None]:
# Let's compare our Linear class with nn.Linear
l1 = Linear(3, 5)
l2 = nn.Linear(3, 5)

In [None]:
l2.weight

In [None]:
l1.weights

In [None]:
l1.weights.data.copy_(l2.weight.T)
l1.bias.data.copy_(l2.bias)

In [None]:
l1(X_train[:5])

In [None]:
l2(X_train[:5])

In [None]:
# l1 and l2 compute the same function through different kernels, so their
# results can differ in the last few bits; compare with a tolerance rather
# than exact equality.
assert torch.allclose(l1(X_train[:5]), l2(X_train[:5]), atol=1e-6)

Sometimes `torch`'s implementations come with some optimizations that make operations run faster.
We'll see examples of this later in the course.
I was just curious to see whether `torch`'s implementation was significantly faster than ours using `%%timeit`.

In [None]:
%%timeit 
l1(X_train[:5])

In [None]:
%%timeit
 l2(X_train[:5])

## Nonlinearities (activation functions)

In [None]:
rng = torch.arange(-5, 5.01, 0.05)
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(rng, F.relu(rng), label='ReLU')
ax.plot(rng, torch.tanh(rng), label='tanh')
ax.plot(rng, torch.sigmoid(rng), label='sigmoid')
ax.plot(rng, F.leaky_relu(rng, negative_slope=0.01), ls='--', label='leaky ReLU')
ax.set_ylim(-1.1, 1.1)
ax.set_title('Common activation functions')
ax.legend()

## Exercise 2.7: Explore our neural network

In this exercise, use the code below to explore different aspects of our neural network.
For each of the following scenarios, inspect the plots of the train/validation loss at the end of the notebook and record what happens with each experiment you do.
Try to...
* Switch out `nn.Linear` for the `Linear` class we defined earlier. Do these networks learn differently?
* What happens if we use `MultiLayerRegressor` (the subclassing API) vs. the `multilayer_regressor` function (the sequential API)? Are these equivalent?
* What happens as you increase the `hidden_dim`?
* Remove the nonlinearity. Does your network learn as well?
* Try adding at least 1 layer in between the input and output layers of the network, with a nonlinearity. **NOTE:** Since we're learning a really simple function, additional layers may not help our loss.

In [None]:
class MultiLayerRegressor(nn.Module):
    """
    Two-layer regressor: linear -> ReLU -> linear with a single output.

    Args:
        dim_in: Number of input features.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, dim_in, hidden_dim):
        super().__init__()
        # To try the hand-written layer instead, swap in the commented lines:
        # self.first_layer = Linear(dim_in, hidden_dim)
        # self.second_layer = Linear(hidden_dim, 1)
        self.first_layer = nn.Linear(dim_in, hidden_dim)
        self.second_layer = nn.Linear(hidden_dim, 1)

    def forward(self, X):
        """Run `X` through the hidden layer, the ReLU, and the output layer."""
        # The nonlinearity sits between the two linear layers.
        hidden = F.relu(self.first_layer(X))
        return self.second_layer(hidden)

In [None]:
def multilayer_regressor(in_dim, hidden_dim):
    """
    Build the two-layer regressor with the sequential API.

    Args:
        in_dim: Number of input features.
        hidden_dim: Width of the hidden layer.

    Returns:
        An `nn.Sequential` of linear -> ReLU -> linear with a single output.
    """
    # Layers are created in the order they are applied.
    layers = [
        nn.Linear(in_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, 1),
    ]
    return nn.Sequential(*layers)

In [None]:
# mlr = MultiLayerRegressor(3, 4)
mlr = multilayer_regressor(3, 4)

In [None]:
train_losses = []
valid_losses = []

In [None]:
LR = 1e-3
N_EPOCHS = 20000

In [None]:
# Notice that instead of iterating through our parameters and applying
# an update rule, we're just using torch's built in SGD optimizer.
opt = optim.SGD(mlr.parameters(), lr=LR)

In [None]:
# Print progress about ten times. Integer arithmetic keeps the modulo exact for
# any N_EPOCHS, and max() avoids a zero interval when N_EPOCHS < 10.
log_every = max(1, N_EPOCHS // 10)
for i in range(N_EPOCHS):
    # The model outputs shape (n, 1); drop the trailing axis to match y_train.
    yhat = mlr(X_train).squeeze(-1)
    loss = F.mse_loss(yhat, y_train)
    loss.backward()
    opt.step()
    opt.zero_grad()
    train_losses.append(loss.item())

    # Evaluate on the validation split without tracking gradients
    with torch.no_grad():
        yhat = mlr(X_valid).squeeze(-1)
        valid_loss = F.mse_loss(yhat, y_valid)
        # Record the *validation* loss here. The training `loss` still requires
        # grad (so .numpy() on it raises) and is the wrong quantity anyway.
        valid_losses.append(valid_loss.item())

    if i % log_every == 0:
        print(f"Epoch {i} Train loss: {loss:.04f}, Valid loss: {valid_loss:.04f}")

In [None]:
idx=1000
fig, ax = plt.subplots(figsize=(12, 12))
ax.plot(train_losses[:idx], label='Train', linewidth=3, alpha=0.5)
ax.plot(valid_losses[:idx], ls='--', label='Valid')
ax.legend()