# Tensor Basics

In [1]:
import torch

In [20]:
x = torch.empty(2,2,3)
print(x)

tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00, -1.4176e-13,  3.0955e-41]],

        [[-3.1550e-14,  3.0955e-41, -8.3674e+02],
         [ 4.5824e-41,  1.4013e-45,  0.0000e+00]]])


In [21]:
x = torch.rand(2,2)
print(x)

tensor([[0.4064, 0.0592],
        [0.1780, 0.4562]])


In [22]:
x = torch.zeros(2,2)
print(x)

tensor([[0., 0.],
        [0., 0.]])


In [23]:
x = torch.ones(2,2)
print(x)

tensor([[1., 1.],
        [1., 1.]])


### Tensor Details

In [35]:
x = torch.ones(2,2,dtype=torch.float16)
print("Tensor dtype:", x.dtype)
print("Tensor size:", x.size())

Tensor dtype: torch.float16
Tensor size: torch.Size([2, 2])


In [36]:
x = torch.tensor([2.5, 0.1])
print(x)

tensor([2.5000, 0.1000])


### Tensor Operations

In [55]:
x = torch.randint(0, 5, (2,))
y = torch.randint(0, 5, (2,))

In [56]:
print("x: ", x)
print("y:", y)

x:  tensor([1, 4])
y: tensor([3, 3])


In [58]:
# z = x + y
z = torch.add(x, y)
print(z)

tensor([4, 7])


In [59]:
# In PyTorch, trailing underscore signifies in-place op.
y.add_(x)
print(y)

tensor([4, 7])


In [60]:
print("x: ", x)
print("y:", y)

x:  tensor([1, 4])
y: tensor([4, 7])


In [63]:
# z = x - y
z = torch.sub(x, y)
print(z)

tensor([-3, -3])


In [64]:
y.sub_(x)
print(y)

tensor([3, 3])


In [65]:
# z = x * y
z = torch.mul(x, y)
print(z)

tensor([ 3, 12])


In [66]:
y.mul_(x)
print(y)

tensor([ 3, 12])


In [68]:
print("x: ", x)
print("y:", y)

x:  tensor([1, 4])
y: tensor([ 3, 12])


### Advanced Operations

**Slicing tensors**

In [71]:
x = torch.rand(5, 3)
print(x)

tensor([[0.8841, 0.9751, 0.9234],
        [0.2452, 0.7758, 0.5056],
        [0.4158, 0.3944, 0.2258],
        [0.6247, 0.0989, 0.0473],
        [0.6611, 0.8358, 0.1755]])


In [72]:
print(x[:])

tensor([[0.8841, 0.9751, 0.9234],
        [0.2452, 0.7758, 0.5056],
        [0.4158, 0.3944, 0.2258],
        [0.6247, 0.0989, 0.0473],
        [0.6611, 0.8358, 0.1755]])


In [74]:
print(x[0, :])

tensor([0.8841, 0.9751, 0.9234])


In [73]:
print(x[:, 0])

tensor([0.8841, 0.2452, 0.4158, 0.6247, 0.6611])


In [75]:
print(x[0, 0])

tensor(0.8841)


In [76]:
print(x[0, 0].item())

0.8840826749801636


**Reshaping tensors**

In [86]:
x = torch.rand(2,3)
print(x)

tensor([[0.9742, 0.0987, 0.7310],
        [0.4212, 0.1334, 0.1215]])


In [88]:
y = x.view(6)
print("y: ", y)
print("size: ", y.size())

y:  tensor([0.9742, 0.0987, 0.7310, 0.4212, 0.1334, 0.1215])
size:  torch.Size([6])


In [89]:
# Putting -1 in place of a dimension tells PyTorch to infer
y = x.view(-1, 3)
print("y: ", y)
print("size: ", y.size())

y:  tensor([[0.9742, 0.0987, 0.7310],
        [0.4212, 0.1334, 0.1215]])
size:  torch.Size([2, 3])


**NumPy and PyTorch Tensors**

In [90]:
import numpy as np

In [103]:
a = torch.ones(5)
print("a:", a)
print("a type: ", type(a))
# If Tensor is on CPU, both objects share same memory location
b = a.numpy()
print("b:", b)
print("b type: ", type(b))

a: tensor([1., 1., 1., 1., 1.])
a type:  <class 'torch.Tensor'>
b: [1. 1. 1. 1. 1.]
b type:  <class 'numpy.ndarray'>


In [102]:
a = np.ones(5)
print("a:", a)
print("a type: ", type(a))
b = torch.from_numpy(a)
print("b:", b)
print("b type: ", type(b))

a: [1. 1. 1. 1. 1.]
a type:  <class 'numpy.ndarray'>
b: tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
b type:  <class 'torch.Tensor'>


### CPU & GPU

In [105]:
if torch.cuda.is_available():
    print(True)

True


In [106]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    x = torch.ones(5, device=device)
    y = torch.ones(5)
    y = y.to(device)
    z = x + y
    print(z)

tensor([2., 2., 2., 2., 2.], device='cuda:0')


In [108]:
# However, numpy can only handle CPU tensors
z.numpy()

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [109]:
z = z.to("cpu")
z.numpy()

array([2., 2., 2., 2., 2.], dtype=float32)

<br><br><br>
## Gradient Calculation w/ Autograd

In [111]:
# Tell PyTorch to create a computational graph that tracks ops
x = torch.randn(3, requires_grad=True)
print(x)

tensor([ 1.4170,  0.9041, -2.7062], requires_grad=True)


In [112]:
y = x + 2
print(y)

tensor([ 3.4170,  2.9041, -0.7062], grad_fn=<AddBackward0>)


In [117]:
z = y * 2
print(z)

tensor([ 6.8340,  5.8083, -1.4124], grad_fn=<MulBackward0>)


In [116]:
# z = z.mean()
# print(z)

In [119]:
# Backward fn needs a gradient vector for multiplication w/
# Jacobian matrix of partial derivatives
v = torch.tensor([0.1, 1.0, 0.001], dtype=torch.float32)
z.backward(v)
print(x.grad)

tensor([0.8667, 2.6667, 0.6687])


**Turning off gradient tracking**

In [122]:
x.requires_grad_(True)
x.requires_grad_(False)
print(x)

tensor([ 1.4170,  0.9041, -2.7062])


In [124]:
x.requires_grad_(True)
y = x.detach()
print("x: ", x)
print("y: ", y)

x:  tensor([ 1.4170,  0.9041, -2.7062], requires_grad=True)
y:  tensor([ 1.4170,  0.9041, -2.7062])


In [127]:
y = x + 2
print(y)

tensor([ 3.4170,  2.9041, -0.7062], grad_fn=<AddBackward0>)


In [128]:
with torch.no_grad():
    y = x + 2
    print(y)

tensor([ 3.4170,  2.9041, -0.7062])


**Gradient accumulation**

In [129]:
# Whenever backward function is called, gradients are accumulated
# In the .grad attribute

weights = torch.ones(4, requires_grad=True)

# Incorrect!
for epoch in range(1):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3.])


In [130]:
# Incorrect!
for epoch in range(1):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)

tensor([6., 6., 6., 6.])


In [131]:
# Incorrect!
for epoch in range(1):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)

tensor([9., 9., 9., 9.])


In [133]:
# Correct!
for epoch in range(1):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_()

tensor([3., 3., 3., 3.])


In [134]:
# Correct!
for epoch in range(1):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_()

tensor([3., 3., 3., 3.])


<br><br><br>
## Backpropagation

In [None]:
# Compute loss (forward), 
# compute local gradients, 
# compute weight updates (backward)

In [135]:
x = torch.tensor(1.0)
y = torch.tensor(2.0)

w = torch.tensor(1.0, requires_grad=True)

In [137]:
# forward pass
y_hat = w * x
loss = (y_hat - y)**2
print(loss)

tensor(1., grad_fn=<PowBackward0>)


In [138]:
# backward pass
loss.backward()
print(w.grad)

tensor(-2.)


In [None]:
# update weights, next forward & backward pass 

<br><br><br>
## Gradient Descent (Manual)

In [139]:
# Optimizing model using automatic gradient computation (autograd)
# Implement gradient descent (linear regression) steps manually
# Prediction and loss function
# Numerical calculation of gradients
# Gradient descent to optimize parameters
# Replace everything with autograd and pytorch

In [141]:
import numpy as np

In [153]:
# linear combination of weights, inputs: f = w * x
X = np.array([1, 2, 3, 4], dtype=np.float32)
Y = np.array([2, 4, 6, 8], dtype=np.float32)

# Initialize weights
w = 0.0

# Model prediction
def forward(x):
    return w * x

# MSE loss
def loss(y, y_pred):
    return ((y_pred - y)**2).mean()

# Gradient
# MSE = 1/N * (w*x - y)**2
# dJ/dw = 1/N * 2x * (w*x - y)
def grad(x, y, y_pred):
    return np.dot(2*x, (y_pred-y)).mean()

In [154]:
print(f'Prediction before training: f(5) = {forward(5):.3f}')

# Training
learning_rate = 0.01
num_epochs = 10

for epoch in range(num_epochs):
    y_pred = forward(X)
    l = loss(Y, y_pred)
    dw = grad(X, Y, y_pred)
    w -= learning_rate * dw
    
    if epoch % 1 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000
epoch 1: w = 1.200, loss = 30.00000000
epoch 2: w = 1.680, loss = 4.79999924
epoch 3: w = 1.872, loss = 0.76800019
epoch 4: w = 1.949, loss = 0.12288000
epoch 5: w = 1.980, loss = 0.01966083
epoch 6: w = 1.992, loss = 0.00314574
epoch 7: w = 1.997, loss = 0.00050331
epoch 8: w = 1.999, loss = 0.00008053
epoch 9: w = 1.999, loss = 0.00001288
epoch 10: w = 2.000, loss = 0.00000206
Prediction after training: f(5) = 9.999


<br><br><br>
## Gradient Descent (PyTorch Gradients)

In [156]:
import torch

In [162]:
# linear combination of weights, inputs: f = w * x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# Initialize weights
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

In [163]:
# Model prediction
def forward(x):
    return w * x

# MSE loss
def loss(y, y_pred):
    return ((y_pred - y)**2).mean()

In [164]:
print(f'Prediction before training: f(5) = {forward(5):.3f}')

# Training
learning_rate = 0.01
num_epochs = 50

for epoch in range(num_epochs):
    y_pred = forward(X)
    l = loss(Y, y_pred)
    
    # Replace manual gradient calculation w/ PyTorch backward pass
    l.backward()

    # Turn off computational graph for weight updates
    with torch.no_grad():
        w -= learning_rate * w.grad
        
    # Set gradients to zero
    w.grad.zero_()
    
    if epoch % 5 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000
epoch 1: w = 0.300, loss = 30.00000000
epoch 6: w = 1.246, loss = 5.90623236
epoch 11: w = 1.665, loss = 1.16278565
epoch 16: w = 1.851, loss = 0.22892261
epoch 21: w = 1.934, loss = 0.04506890
epoch 26: w = 1.971, loss = 0.00887291
epoch 31: w = 1.987, loss = 0.00174685
epoch 36: w = 1.994, loss = 0.00034392
epoch 41: w = 1.997, loss = 0.00006770
epoch 46: w = 1.999, loss = 0.00001333
Prediction after training: f(5) = 9.997


<br><br><br>
## Gradient Descent (PyTorch Loss, Optimization)

In [166]:
# 1) Design models (input size, output size, forward pass layers)
# 2) Construct loss, optimizer
# 3) Construct training loop: 
    # forward (preds), backward (grads), update (weights)

In [178]:
import torch
import torch.nn as nn

In [179]:
# linear combination of weights, inputs: f = w * x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# Initialize weights
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

In [180]:
# Model prediction
def forward(x):
    return w * x

In [181]:
print(f'Prediction before training: f(5) = {forward(5):.3f}')

# Training
learning_rate = 0.01
num_epochs = 50

# Define loss, optimizer from PyTorch
mse_loss = nn.MSELoss()
optimizer = torch.optim.SGD([w], lr=learning_rate)

for epoch in range(num_epochs):
    y_pred = forward(X)
    l = mse_loss(Y, y_pred)
    
    # Replace manual gradient calculation w/ PyTorch backward pass
    l.backward()

    # Use optimizer instead of manually updating weights
    optimizer.step()
        
    # Set optimizer gradients to zero
    optimizer.zero_grad()
    
    if epoch % 5 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000
epoch 1: w = 0.300, loss = 30.00000000
epoch 6: w = 1.246, loss = 5.90623236
epoch 11: w = 1.665, loss = 1.16278565
epoch 16: w = 1.851, loss = 0.22892261
epoch 21: w = 1.934, loss = 0.04506890
epoch 26: w = 1.971, loss = 0.00887291
epoch 31: w = 1.987, loss = 0.00174685
epoch 36: w = 1.994, loss = 0.00034392
epoch 41: w = 1.997, loss = 0.00006770
epoch 46: w = 1.999, loss = 0.00001333
Prediction after training: f(5) = 9.997


<br><br><br>
## Gradient Descent (PyTorch Model, Forward Pass)

In [182]:
# linear combination of weights, inputs: f = w * x
X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32)
Y = torch.tensor([[2], [4], [6], [8]], dtype=torch.float32)

# No need to initialize weights
# No need to define forward pass

In [183]:
n_samples, n_features = X.shape
print(n_samples, n_features)

input_size = n_features
output_size = n_features

# Define model
model = nn.Linear(input_size, output_size)

4 1


In [184]:
X_test = torch.tensor([5], dtype=torch.float32)

In [185]:
print(f'Prediction before training: f(5) = {model(X_test).item():.3f}')

# Training
learning_rate = 0.01
num_epochs = 50

# Define loss, optimizer from PyTorch
mse_loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    y_pred = model(X)
    l = mse_loss(Y, y_pred)
    
    # Replace manual gradient calculation w/ PyTorch backward pass
    l.backward()

    # Use optimizer instead of manually updating weights
    optimizer.step()
        
    # Set optimizer gradients to zero
    optimizer.zero_grad()
    
    if epoch % 5 == 0:
        [w, b] = model.parameters()
        print(f'epoch {epoch+1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {model(X_test).item():.3f}')

Prediction before training: f(5) = 4.221
epoch 1: w = 1.014, loss = 9.98511600
epoch 6: w = 1.532, loss = 1.62430954
epoch 11: w = 1.741, loss = 0.27894422
epoch 16: w = 1.825, loss = 0.06202174
epoch 21: w = 1.860, loss = 0.02662332
epoch 26: w = 1.876, loss = 0.02043771
epoch 31: w = 1.883, loss = 0.01896561
epoch 36: w = 1.887, loss = 0.01826570
epoch 41: w = 1.889, loss = 0.01770367
epoch 46: w = 1.891, loss = 0.01717711
Prediction after training: f(5) = 9.778


In [187]:
[w, b] = model.parameters()

In [193]:
print(f'model weights: {w}\n')
print(f'model bias: {b}')

model weights: Parameter containing:
tensor([[1.8924]], requires_grad=True)

model bias: Parameter containing:
tensor([0.3159], requires_grad=True)


<br><br><br>
## Gradient Descent (Custom Linear Regression)

In [194]:
# linear combination of weights, inputs: f = w * x
X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32)
Y = torch.tensor([[2], [4], [6], [8]], dtype=torch.float32)

# No need to initialize weights
# No need to define forward pass

In [198]:
n_samples, n_features = X.shape
print(n_samples, n_features)

input_size = n_features
output_size = n_features

4 1


In [200]:
X_test = torch.tensor([5], dtype=torch.float32)

In [197]:
class LinearRegression(nn.Module):
    
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        
        # define layers
        self.lin = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        return self.lin(x)

In [199]:
# Define model
model = LinearRegression(input_size, output_size)  

In [201]:
print(f'Prediction before training: f(5) = {model(X_test).item():.3f}')

# Training
learning_rate = 0.01
num_epochs = 50

# Define loss, optimizer from PyTorch
mse_loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    y_pred = model(X)
    l = mse_loss(Y, y_pred)
    
    # Replace manual gradient calculation w/ PyTorch backward pass
    l.backward()

    # Use optimizer instead of manually updating weights
    optimizer.step()
        
    # Set optimizer gradients to zero
    optimizer.zero_grad()
    
    if epoch % 5 == 0:
        [w, b] = model.parameters()
        print(f'epoch {epoch+1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {model(X_test).item():.3f}')

Prediction before training: f(5) = 0.096
epoch 1: w = 0.429, loss = 31.53911972
epoch 6: w = 1.348, loss = 5.07474899
epoch 11: w = 1.716, loss = 0.81793153
epoch 16: w = 1.865, loss = 0.13317804
epoch 21: w = 1.924, loss = 0.02298848
epoch 26: w = 1.949, loss = 0.00521830
epoch 31: w = 1.959, loss = 0.00231509
epoch 36: w = 1.963, loss = 0.00180450
epoch 41: w = 1.965, loss = 0.00168008
epoch 46: w = 1.966, loss = 0.00161900
Prediction after training: f(5) = 9.931
