For training a model, there are two initialization steps:

    Random initialization of parameters/weights (we have only two, a and b) — lines 3 and 4;
    Initialization of hyper-parameters (in our case, only learning rate and number of epochs) — lines 9 and 11;

Make sure to always initialize your random seed to ensure reproducibility of your results. As usual, the random seed is 42, the least random of all random seeds one could possibly choose :-)

For each epoch, there are four training steps:

    Compute model’s predictions — this is the forward pass — line 15;
    Compute the loss, using predictions and and labels and the appropriate loss function for the task at hand — lines 18 and 20;
    Compute the gradients for every parameter — lines 23 and 24;
    Update the parameters — lines 27 and 28;

Just keep in mind that, if you don’t use batch gradient descent (our example does),you’ll have to write an inner loop to perform the four training steps for either each individual point (stochastic) or n points (mini-batch). We’ll see a mini-batch example later down the line.

In [None]:
# For creating test and validation sets, it's important to
# shuffle the array of indices. That allows us to randomize both
# examples x and ground truth y in the same way:

import numpy as np

# Data Generation
np.random.seed(42)
x = np.random.rand(100, 1)
y = 1 + 2 * x + .1 * np.random.randn(100, 1)

# Shuffles the indices
idx = np.arange(100)
np.random.shuffle(idx)

# Uses first 80 random indices for train
train_idx = idx[:80]
# Uses the remaining indices for validation
val_idx = idx[80:]

# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

In [None]:
# Linear regression using a sequential model

import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict

torch.manual_seed(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)

lr = 1e-1
epochs = 1000

model = nn.Sequential(OrderedDict([('linear',nn.Linear(1,1,bias=True))])).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

for epoch in range(1,epochs):
    optimizer.zero_grad()
    y_hat = model(x_train_tensor)
    error = y_train_tensor - y_hat
    loss = (error**2).mean()
    loss.backward()
    optimizer.step()

print(model[0].weight) # That's how we can access the first layer, in case its unnamed
print(model.linear.bias) # Alternatively, in case we named the layer
print()
print(model.state_dict())

In [None]:
# Linear regression in Numpy (a linear regression takes the shape of y = a + bx + e

a = np.random.randn(1,1)
b = np.random.randn(1,1)

lr = 1e-1
epochs = 1000

for epoch in range(1,epochs):
    # Forward pass - compute the model predictions
    y_hat = a + b * x_train
    
    # Compute the loss
    loss = 1/len(x_train)*np.sum((y_train-y_hat)**2)
    
    # Compute the gradients
    grad_a = -2*1/len(x_train)*np.sum(y_train-y_hat)
    grad_b = -2*1/len(x_train)*np.sum(x_train*(y_train-y_hat))
    
    # Update the parameters
    a = a-lr*grad_a
    b = b-lr*grad_b
    
print(a,b)

In [None]:
# Sanity Check: do we get the same results as our gradient descent?
from sklearn.linear_model import LinearRegression
linr = LinearRegression()
linr.fit(x_train, y_train)
print(linr.intercept_, linr.coef_[0])

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
#from torchviz import make_dot

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Our data was in Numpy arrays, but we need to transform them into PyTorch's Tensors
# and then we send them to the chosen device
x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)

# Here we can see the difference - notice that .type() is more useful
# since it also tells us WHERE the tensor is (device)
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

In [None]:
# FIRST
# Initializes parameters "a" and "b" randomly, ALMOST as we did in Numpy
# since we want to apply gradient descent on these parameters, we need
# to set REQUIRES_GRAD = TRUE
a = torch.randn(1, requires_grad=True, dtype=torch.float)
b = torch.randn(1, requires_grad=True, dtype=torch.float)
print(a, b)

# SECOND
# But what if we want to run it on a GPU? We could just send them to device, right?
a = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
b = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
print(a, b)
# Sorry, but NO! The to(device) "shadows" the gradient...

# THIRD
# We can either create regular tensors and send them to the device (as we did with our data)
a = torch.randn(1, dtype=torch.float).to(device)
b = torch.randn(1, dtype=torch.float).to(device)
# and THEN set them as requiring gradients...
a.requires_grad_()
b.requires_grad_()
print(a, b)

In [None]:
# Reimplementation of linear regression using Torch

import torch
import torch.optim as optim
import torch.nn as nn

lr = 1e-1
epochs = 1000

device = 'cuda' if torch.cuda.is_available() else 'cpu'

a = torch.randn(1, requires_grad = True, device = device)
b = torch.randn(1, requires_grad = True, device = device)

x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)

for epoch in range(1, epochs):
    y_hat = a+b*x_train_tensor
    error = y_train_tensor - y_hat
    loss = (error**2).mean()
    loss.backward()
    print(a.grad)
    print(b.grad)
    with torch.no_grad():
        a -= lr * a.grad
        b -= lr * b.grad
    a.grad.zero_()
    b.grad.zero_()
    
print(a,b)

In [None]:
import torchviz

torchviz.make_dot(loss)

In [None]:
# Creating a dataset
# Notice that for simple dataset comprised of two tensors, the pre-built
# TensorDataset class is already enough

from torch.utils.data import Dataset, TensorDataset

class CustomDataset(Dataset):
    def __init__(self, x_train, y_train):
        self.x = x_train
        self.y = y_train
        
    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])
    
    def __len__(self):
        return len(self.x)
    
x_train_tensor = torch.from_numpy(x_train).float()
y_train_tensor = torch.from_numpy(y_train).float()
    
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

#train_data = TensorDataset(x_train_tensor, y_train_tensor)
#print(train_data[0])

In [None]:
# Creating a data loader

from torch.utils.data import DataLoader

train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)

next(iter(train_loader))

In [10]:
# Putting it all together: Dataset class, dataloader, splitting,
# and linear regression using a sequential model

import numpy as np

# Data Generation
np.random.seed(42)
x = np.random.rand(100, 1)
y = 1 + 2 * x + .1 * np.random.randn(100, 1)

import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

torch.manual_seed(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class CustomDataset(Dataset):
    def __init__(self, x_train, y_train):
        self.x = x_train
        self.y = y_train
        
    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])
    
    def __len__(self):
        return len(self.x)
    
x_tensor = torch.from_numpy(x).float() # Since we are building the dataset, let's not send it to the GPU.
y_tensor = torch.from_numpy(y).float() # In a real-world scenario, thhis is advisable to save GPU RAM
    
dataset = CustomDataset(x_tensor, y_tensor)

train_data, eval_data = random_split(dataset, [80,20])

train_loader = DataLoader(dataset=train_data, batch_size=16)
eval_loader = DataLoader(dataset=eval_data, batch_size=20)

def make_train_step(optimizer, loss_fn, model):
    def train_step(x, y):
        optimizer.zero_grad()
        model.train()
        y_hat = model(x)
        loss = loss_fn(y_hat, y)
        loss.backward()
        optimizer.step()
        return loss.item()
    return train_step

lr = 1e-1
epochs = 1000

model = nn.Sequential(OrderedDict([('linear',nn.Linear(1,1,bias=True))])).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

loss_fn = nn.MSELoss(reduction='mean')

losses = []

train_step = make_train_step(optimizer, loss_fn, model)

for epoch in range(1,epochs):
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        losses.append(loss)

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[1.9625]], device='cuda:0')), ('linear.bias', tensor([1.0147], device='cuda:0'))])


In [16]:
# Evalution (basically the same as training, but without computing
# gradients and updating parameters

val_losses = []

with torch.no_grad():
    for x_batch, y_batch in eval_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        model.eval()
        y_hat = model(x_batch)
        val_loss = loss_fn(y_batch,y_hat)
        val_losses.append(val_loss)
        
print(model.state_dict())
print()
print("Mean error: "+str(np.mean(losses)))

OrderedDict([('linear.weight', tensor([[1.9625]], device='cuda:0')), ('linear.bias', tensor([1.0147], device='cuda:0'))])

Mean error: 0.014371141255949741
