# Introduction to PyTorch

Some exploration into using PyTorch for Data Science. 

In [1]:
import numpy as np

In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
# from torchviz import make_dot

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Linear Regression

- Create synthetic linear data
- Find the MSE loss
- Perform gradient descent to minimise the loss

### Create Data

In [4]:
# Fix NumPy's global seed so the synthetic data set is reproducible
np.random.seed(42)

# Synthetic data: uniform x in [0, 1), y = 1 + 2x plus Gaussian noise (std 0.1)
x = np.random.rand(100, 1)
y = 1 + 2 * x + .1 * np.random.randn(100, 1)

# Shuffle the row indices, then use the first 80 for training and the last 20 for validation
idx = np.arange(100)
np.random.shuffle(idx)
train_idx, val_idx = idx[:80], idx[80:]

# Materialise the train and validation splits
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

### NumPy Implementation

In [5]:
# Start both parameters from independent standard-normal draws ("a" is drawn first)
a, b = np.random.randn(1), np.random.randn(1)
print(a, b)

[-2.02514259] [0.18645431]


In [6]:
lr = 1e-1        # learning rate: step size of each gradient descent update
n_epochs = 1000  # number of full passes over the training set

for epoch in range(n_epochs):
    # Forward pass: predictions and residuals on the training set
    yhat = a + b * x_train
    error = y_train - yhat
    # MSE loss; kept for reference, the updates below use the analytic gradients
    loss = np.mean(error ** 2)

    # Gradients of the MSE with respect to "a" and "b" (chain rule)
    a_grad = -2 * np.mean(error)
    b_grad = -2 * np.mean(x_train * error)

    # Step both parameters against their gradients, scaled by the learning rate
    a, b = a - lr * a_grad, b - lr * b_grad

print(a, b)

[1.02354078] [1.96896443]


In [7]:
# Sanity check: scikit-learn's least-squares fit should agree with the
# intercept and slope found by our gradient descent
from sklearn.linear_model import LinearRegression

linr = LinearRegression().fit(x_train, y_train)
print(linr.intercept_, linr.coef_[0])

[1.02354075] [1.96896447]


### Torch It

Time to use PyTorch tensors to do this.

Methods whose names end with `_` (for example `requires_grad_()` and `zero_()`) perform the operation in place.

In [8]:
# The NumPy arrays are float64; cast to float32 and move to the chosen device in one .to() call
x_train_tensor = torch.from_numpy(x_train).to(device=device, dtype=torch.float)
y_train_tensor = torch.from_numpy(y_train).to(device=device, dtype=torch.float)
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

<class 'numpy.ndarray'> <class 'torch.Tensor'> torch.FloatTensor


In [9]:
# Standard-normal starting values; requires_grad=True makes autograd track both
# tensors, which gradient descent needs
a, b = (torch.randn(1, requires_grad=True, dtype=torch.float) for _ in range(2))
print(a, b)

tensor([0.2647], requires_grad=True) tensor([0.6635], requires_grad=True)


In [10]:
# To run in a GPU we need to send to the device
a = torch.randn(1, dtype=torch.float).to(device)
b = torch.randn(1, dtype=torch.float).to(device)
a.requires_grad_()
b.requires_grad_()
print(a, b)

tensor([0.3973], requires_grad=True) tensor([-0.0471], requires_grad=True)


In [11]:
# Reinitialise and specify device
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)


In [12]:
lr = 1e-1
n_epochs = 1000

for epoch in range(n_epochs):
    # Forward pass and MSE loss, built from tensor operations
    yhat = a + b * x_train_tensor
    error = y_train_tensor - yhat
    loss = error.pow(2).mean()

    # PyTorch works backwards from the loss to compute the gradients,
    # storing them in a.grad and b.grad
    loss.backward()

    # The parameter update must stay out of the gradient computation:
    # inside no_grad the in-place steps are not added to PyTorch's
    # dynamic computation graph
    with torch.no_grad():
        a.sub_(lr * a.grad)
        b.sub_(lr * b.grad)

    # Gradients accumulate across backward() calls, so zero them in place
    # before the next epoch
    a.grad.zero_()
    b.grad.zero_()

print(a, b)

tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


### PyTorch Optimizers

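An optimizer updates the parameters through its `step()` method, using the gradients stored on them and its hyperparameters (such as the learning rate); its `zero_grad()` method then clears those gradients ready for the next iteration.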

In [13]:
# Reset the seed so this section starts from the same initial parameters
torch.manual_seed(42)
a, b = (
    torch.randn(1, requires_grad=True, dtype=torch.float, device=device),
    torch.randn(1, requires_grad=True, dtype=torch.float, device=device),
)
print(a, b)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)


In [14]:
lr = 1e-1
n_epochs = 1000

# SGD optimizer responsible for updating "a" and "b"; Adam is another option
optimizer = optim.SGD(params=[a, b], lr=lr)

for epoch in range(n_epochs):
    # Forward pass and MSE loss
    yhat = a + b * x_train_tensor
    error = y_train_tensor - yhat
    loss = error.pow(2).mean()

    loss.backward()        # compute gradients for a and b
    optimizer.step()       # apply the SGD update to both parameters
    optimizer.zero_grad()  # clear the gradients before the next epoch

print(a, b)

tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


### PyTorch Loss Functions

PyTorch's `nn.MSELoss` builds the mean-squared-error loss function for us, so we no longer compute the loss by hand.

In [15]:
# Same seed as before, so the loss-function version starts from identical parameters
torch.manual_seed(42)
a, b = (
    torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    for _ in range(2)
)
print(a, b)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)


In [16]:
lr = 1e-1
n_epochs = 1000

optimizer = optim.SGD([a, b], lr=lr)

# Defines a MSE loss function. reduction='mean' averages the squared errors;
# reduction='sum' also works, but the loss and its gradients then grow with the
# number of samples, so the learning rate would need to be scaled down
loss_fn = nn.MSELoss(reduction='mean')

for epoch in range(n_epochs):
    # Forward pass: predictions for the training inputs
    yhat = a + b * x_train_tensor

    # Loss modules are called as loss_fn(input, target): prediction first, label
    # second. MSE is symmetric so the value is unchanged, but the documented
    # order keeps this pattern correct when swapping in an asymmetric loss
    loss = loss_fn(yhat, y_train_tensor)

    loss.backward()        # compute gradients for a and b
    optimizer.step()       # update a and b
    optimizer.zero_grad()  # clear the gradients before the next epoch

print(a, b)

tensor([1.0235], requires_grad=True) tensor([1.9690], requires_grad=True)


### Create a PyTorch Model

This means we can initialise and call the model multiple times. The key is the `forward` method, which defines our prediction.

In [17]:
class ManualLinearRegression(nn.Module):
    """Single-feature linear regression, ``a + b * x``, with hand-declared parameters."""

    def __init__(self):
        """Draw the initial intercept ``a`` and slope ``b`` from a standard normal."""
        super().__init__()
        intercept = torch.randn(1, dtype=torch.float)
        slope = torch.randn(1, dtype=torch.float)
        # Wrapping the tensors in nn.Parameter makes them real parameters of the
        # model: they appear in parameters() / state_dict() and require gradients
        # by default
        self.a = nn.Parameter(intercept)
        self.b = nn.Parameter(slope)

    def forward(self, x):
        """Compute the model's outputs / predictions for ``x``."""
        prediction = self.a + self.b * x
        return prediction

In [18]:
# Seed first: the parameters are initialised inside the model when it is constructed
torch.manual_seed(42)
model = ManualLinearRegression()
model = model.to(device)
print(model.state_dict())

OrderedDict([('a', tensor([0.3367])), ('b', tensor([0.1288]))])


In [19]:
lr = 1e-1
n_epochs = 1000

# The optimizer now receives the parameters from the model instead of a hand-built list
optimizer = optim.SGD(model.parameters(), lr=lr)
loss_fn = nn.MSELoss(reduction='mean')

for epoch in range(n_epochs):
    # Put the model in training mode (the conventional first step of a training
    # iteration; it only changes behaviour for layers such as dropout or batch norm)
    model.train()

    # Calling the model runs its forward method, i.e. the prediction
    yhat = model(x_train_tensor)

    # Loss modules are called as loss_fn(input, target): prediction first, label
    # second. MSE is symmetric so the value is unchanged, but the documented
    # order keeps this pattern correct when swapping in an asymmetric loss
    loss = loss_fn(yhat, y_train_tensor)

    loss.backward()        # compute gradients for the model parameters
    optimizer.step()       # update the parameters
    optimizer.zero_grad()  # clear the gradients before the next epoch

print(model.state_dict())

OrderedDict([('a', tensor([1.0235])), ('b', tensor([1.9690]))])


### Nested Model

Use PyTorch's built-in linear layer (`nn.Linear`) as the basis of our model.

In [20]:
class LayerLinearRegression(nn.Module):
    """Linear regression that delegates to a single ``nn.Linear`` layer."""

    def __init__(self):
        """Create the nested linear layer with one input and one output feature."""
        super().__init__()
        # Instead of custom parameters, the Linear layer owns a weight (the slope)
        # and a bias (the intercept) and computes weight * x + bias
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Make predictions with a single call to the nested layer."""
        prediction = self.linear(x)
        return prediction

In [21]:
# Seed first: the nested Linear layer initialises its weight and bias on construction
torch.manual_seed(42)
model = LayerLinearRegression()
model = model.to(device)
print(model.state_dict())

OrderedDict([('linear.weight', tensor([[0.7645]])), ('linear.bias', tensor([0.8300]))])


In [22]:
lr = 1e-1
n_epochs = 1000

loss_fn = nn.MSELoss(reduction='mean')
# model.parameters() yields the nested Linear layer's weight and bias
optimizer = optim.SGD(model.parameters(), lr=lr)

for epoch in range(n_epochs):
    # Put the model in training mode (the conventional first step of a training
    # iteration; it only changes behaviour for layers such as dropout or batch norm)
    model.train()

    # Calling the model runs its forward method, i.e. the prediction
    yhat = model(x_train_tensor)

    # Loss modules are called as loss_fn(input, target): prediction first, label
    # second. MSE is symmetric so the value is unchanged, but the documented
    # order keeps this pattern correct when swapping in an asymmetric loss
    loss = loss_fn(yhat, y_train_tensor)
    loss.backward()        # compute gradients for the layer's weight and bias
    optimizer.step()       # update the parameters
    optimizer.zero_grad()  # clear the gradients before the next epoch

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[1.9690]])), ('linear.bias', tensor([1.0235]))])
