# Programming for Data Science and Artificial Intelligence

## Deep Learning -  PyTorch I - Introduction and Linear Regression

- [WEIDMAN] Ch7
- https://pytorch.org/tutorials/
- https://github.com/yunjey/pytorch-tutorial

Here we introduce PyTorch, an increasingly popular neural network framework based on **automatic differentiation**.

### 1. Tensor basics

In [1]:
import torch
import numpy as np
import sys

In [2]:
# Show the installed PyTorch version string
torch.__version__

'1.9.1'

In [3]:
# Wrap an existing NumPy array as a PyTorch tensor; the tensor's dtype follows the array
arr = np.array([1, 2, 3, 4, 5])
x = torch.from_numpy(arr)
print(x, x.dtype, sep="\n")

tensor([1, 2, 3, 4, 5])
torch.int64


In [4]:
# Element data type held by the tensor
print("Type: ", x.dtype)

# Dimensions of the tensor
print("Shape: ", x.shape)

# size() reports the same torch.Size as .shape
print("Size", x.size())

# Device that holds the tensor (cpu or a gpu)
print("Device: ", x.device)

# Python class of the object returned by x.data
print(type(x.data))

Type:  torch.int64
Shape:  torch.Size([5])
Size torch.Size([5])
Device:  cpu
<class 'torch.Tensor'>


In [5]:
# Prefer the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device: ", device)

Device:  cpu


In [6]:
# torch.from_numpy() wraps the array's buffer: the tensor and the array share memory
arr = np.arange(5)
t = torch.from_numpy(arr)
print(t)

# A write through the NumPy array is therefore visible in the tensor
arr[2] = 77
print(t)

tensor([0, 1, 2, 3, 4])
tensor([ 0,  1, 77,  3,  4])


In [7]:
# torch.tensor() copies the data: the tensor is independent of the source array
arr = np.arange(5)
t = torch.tensor(arr)
print(t)

# A later write to the NumPy array does not reach the copy
arr[2] = 77
print(t)

tensor([0, 1, 2, 3, 4])
tensor([0, 1, 2, 3, 4])


In [8]:
# Small integer array used to compare torch.Tensor(...) with torch.tensor(...)
data = np.asarray([1, 2, 3])

In [9]:
a = torch.Tensor(data)  # torch.Tensor(...) does not keep the source dtype: the integer data becomes a FloatTensor
print(a, a.type())

tensor([1., 2., 3.]) torch.FloatTensor


In [10]:
b = torch.tensor(data) # torch.tensor(...) keeps the dtype of the source array
print(b, b.type())

tensor([1, 2, 3]) torch.LongTensor


In [11]:
# Changing the dtype: .type(dtype) gives back a converted tensor, so b is rebound to the result
print('Old:', b.type())
b = b.type(torch.float64)
print('New:', b.type())

Old: torch.LongTensor
New: torch.DoubleTensor


### 2. Creating tensors from scratch

In [12]:
# torch.empty only allocates storage; the printed values are whatever was already in memory
x = torch.empty((4, 3))
print(x)

tensor([[9.3302e-33, 4.5822e-41, 9.9492e-44],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 7.7052e+31, 1.9447e+31]])


In [13]:
# 4x3 tensor of zeros stored as 64-bit integers
x = torch.zeros((4, 3), dtype=torch.long)
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [14]:
# 4x4 tensor of ones in the default floating-point dtype
x = torch.ones((4, 4))
print(x)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])


In [15]:
# torch.arange(start, end, step): `end` is exclusive, so 18 itself is not produced
x = torch.arange(start=0, end=18, step=2).reshape(3, 3)
print(x)

tensor([[ 0,  2,  4],
        [ 6,  8, 10],
        [12, 14, 16]])


In [16]:
# torch.linspace(start, end, steps): `steps` is the number of points (not the spacing),
# and `end` is inclusive
x = torch.linspace(0, 18, steps=12).reshape(3, 4)
print(x)

tensor([[ 0.0000,  1.6364,  3.2727,  4.9091],
        [ 6.5455,  8.1818,  9.8182, 11.4545],
        [13.0909, 14.7273, 16.3636, 18.0000]])


In [17]:
# Uniform random samples from the half-open interval [0, 1)
x = torch.rand((4, 3))
print(x)

tensor([[0.0363, 0.3914, 0.4867],
        [0.8551, 0.3482, 0.1960],
        [0.2901, 0.2205, 0.4796],
        [0.8842, 0.3104, 0.8531]])


In [18]:
# Samples from the standard normal distribution (mean 0, standard deviation 1)
x = torch.randn((4, 3))
print(x)

tensor([[-1.0350, -0.2010, -0.4412],
        [ 0.6903, -1.1972,  0.2232],
        [-0.6363,  0.2769, -0.9859],
        [ 0.7911,  0.9866,  0.8692]])


In [19]:
# Random integers drawn from [low, high): low is inclusive, high is exclusive
x = torch.randint(low=0, high=5, size=(4, 3))
print(x)

tensor([[2, 4, 1],
        [2, 1, 1],
        [4, 1, 4],
        [0, 2, 0]])


<a href='https://pytorch.org/docs/stable/torch.html#torch.rand_like'><strong><tt>torch.rand_like(input)</tt></strong></a><br>
<a href='https://pytorch.org/docs/stable/torch.html#torch.randn_like'><strong><tt>torch.randn_like(input)</tt></strong></a><br>
<a href='https://pytorch.org/docs/stable/torch.html#torch.randint_like'><strong><tt>torch.randint_like(input,low,high)</tt></strong></a><br> these return random number tensors with the same size as <tt>input</tt>

In [20]:
# 2x5 template tensor; the *_like calls below copy its size
x = torch.zeros((2, 5))
print(x)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])


In [21]:
# Standard-normal samples in a tensor with the same size as x
x2 = torch.randn_like(x)
print(x2)

tensor([[-0.2510,  0.5824,  1.7459, -1.3541,  0.6830],
        [-1.9817,  0.3556, -0.6023,  0.0152,  1.4688]])


The same syntax can be used with<br>
<a href='https://pytorch.org/docs/stable/torch.html#torch.zeros_like'><strong><tt>torch.zeros_like(input)</tt></strong></a><br>
<a href='https://pytorch.org/docs/stable/torch.html#torch.ones_like'><strong><tt>torch.ones_like(input)</tt></strong></a>

In [22]:
# Tensor of ones with the same size as x2
x3 = torch.ones_like(x2)
print(x3)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])


In [23]:
# Tensor of zeros with the same size as x3
x4 = torch.zeros_like(x3)
print(x4)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])


### 3. Understanding backpropagation

In [24]:
import torch

# Scalar leaf tensors that autograd should track.
# Only tensors of floating point dtype can require gradients.
x = torch.tensor(2.0, requires_grad=True)
w = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(3.0, requires_grad=True)

# Forward pass: every operation is recorded in a computational graph.
y = w * x + b    # y = 2 * 2 + 3 = 7

# Backward pass: PyTorch differentiates y with respect to each leaf tensor
# and stores the result in that tensor's .grad attribute.
y.backward()

# Print out the gradients.
print("Gradient of x: ", x.grad)    # dy/dx = w = 2
print("Gradient of w: ", w.grad)    # dy/dw = x = 2
print("Gradient of b: ", b.grad)    # dy/db = 1

Gradient of x:  tensor(2.)
Gradient of w:  tensor(2.)
Gradient of b:  tensor(1.)


In [25]:
# A slightly larger graph: elementwise operations followed by a mean reduction
x = torch.tensor([[1., 2., 3.], [3., 2., 1.]], requires_grad=True)
print("X", x)

y = 3 * x + 2      # dy/dx = 3
print(y)

z = 2 * y ** 2     # dz/dy = 4y = 4(3x + 2)
print(z)

out = z.mean()     # mean over 6 elements, so d(out)/dz = 1/6
print(out)

# Chain rule: d(out)/dx = d(out)/dz * dz/dy * dy/dx = 1/6 * 4(3x + 2) * 3
# (for x = 1 this gives 10)
out.backward()
print(x.grad)

X tensor([[1., 2., 3.],
        [3., 2., 1.]], requires_grad=True)
tensor([[ 5.,  8., 11.],
        [11.,  8.,  5.]], grad_fn=<AddBackward0>)
tensor([[ 50., 128., 242.],
        [242., 128.,  50.]], grad_fn=<MulBackward0>)
tensor(140., grad_fn=<MeanBackward0>)
tensor([[10., 16., 22.],
        [22., 16., 10.]])


## 4. Case Study: Linear Regression

Let's use linear regression as a case study to explore the different components of PyTorch.  These are the components we will be covering:

1. Specifying input and target
2. Dataset and DataLoader
3. nn.Linear (Dense)
4. Define loss function
5. Define optimizer function
6. Train the model

Consider this data:

<img src = "../figures/japan.png" width="400">

In a linear regression model, each target variable is estimated to be a weighted sum of the input variables, offset by some constant, known as a bias :

$$\text{yield}_\text{apple}  = w_{11} * \text{temp} + w_{12} * \text{rainfall} + w_{13} * \text{humidity} + b_{1}$$

$$\text{yield}_\text{orange} = w_{21} * \text{temp} + w_{22} * \text{rainfall} + w_{23} * \text{humidity} + b_{2}$$

Visually, it means that the yield of apples is a linear or planar function of temperature, rainfall and humidity:

<img src = "../figures/japan2.png" width="400">

The learning part of linear regression is to figure out a set of weights <code>w11, w12, ..., w23</code> and biases <code>b1, b2</code> using gradient descent.

#### 1. Specifying input and target

In [26]:
# Input (temp, rainfall, humidity): five distinct observations, repeated three times
x_train = np.tile(
    np.array([[73, 67, 43],
              [91, 88, 64],
              [87, 134, 58],
              [102, 43, 37],
              [69, 96, 70]], dtype='float32'),
    (3, 1),
)

# Targets (apples, oranges), in the same row order as the inputs
y_train = np.tile(
    np.array([[56, 70],
              [81, 101],
              [119, 133],
              [22, 37],
              [103, 119]], dtype='float32'),
    (3, 1),
)

# Expose the arrays as tensors and confirm their sizes
inputs = torch.from_numpy(x_train)
targets = torch.from_numpy(y_train)
print(inputs.size())
print(targets.size())

torch.Size([15, 3])
torch.Size([15, 2])


#### 2. Dataset and DataLoader

We'll create a <code>TensorDataset</code>, which allows access to rows from inputs and targets as tuples. A <code>DataLoader</code> (discussed shortly) takes a dataset as input, so to use it with data that comes from NumPy arrays we first have to wrap the tensors in a <code>TensorDataset</code>.

In [27]:
from torch.utils.data import TensorDataset

In [28]:
# Define dataset: pairs each row of inputs with the matching row of targets
train_ds = TensorDataset(inputs, targets)
# Slicing the dataset gives a tuple (inputs slice, targets slice)
train_ds[0:3]

(tensor([[ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.]]),
 tensor([[ 56.,  70.],
         [ 81., 101.],
         [119., 133.]]))

We'll now create a <code>DataLoader</code>, which can split the data into batches of a predefined size while training. It also provides other utilities like shuffling and random sampling of the data.

In [29]:
from torch.utils.data import DataLoader

In [30]:
# Define data loader: batches of 3 samples, with shuffle=True to randomize the sample order
batch_size = 3
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

The data loader is typically used in a for-in loop. Let's look at an example

In [31]:
# Peek at a single batch: print it and stop after the first iteration
for xb, yb in train_dl:
    print(xb)
    print(yb)
    break

tensor([[ 87., 134.,  58.],
        [102.,  43.,  37.],
        [102.,  43.,  37.]])
tensor([[119., 133.],
        [ 22.,  37.],
        [ 22.,  37.]])


In each iteration, the data loader returns one batch of data, with the given batch size. If shuffle is set to True, it shuffles the training data before creating batches. Shuffling helps randomize the input to the optimization algorithm, which can lead to faster reduction in the loss.

#### 3. Define some layer - nn.Linear

Instead of initializing the weights & biases manually, we can define the model using the <code>nn.Linear</code> class from PyTorch, which does it automatically.

In [32]:
import torch.nn as nn

# Define model: a linear layer mapping 3 input features to 2 outputs.
# nn.Linear takes its arguments in the order (in_features, out_features).
model = nn.Linear(in_features=3, out_features=2)

# The weight is stored as (out_features, in_features), the bias as (out_features,)
print(model.weight)
print(model.weight.size())
print(model.bias)
print(model.bias.size())

Parameter containing:
tensor([[ 0.3826, -0.4897,  0.4299],
        [ 0.5745, -0.3261,  0.3227]], requires_grad=True)
torch.Size([2, 3])
Parameter containing:
tensor([0.0843, 0.3305], requires_grad=True)
torch.Size([2])


In fact, our model is simply a function that performs a matrix multiplication of the <code>inputs</code> and the weights <code>w</code> and adds the bias <code>b</code> (for each observation)

<img src = "../figures/dot.png" width="400">

PyTorch models also have a helpful <code>.parameters()</code> method, which returns an iterator over all the weight and bias tensors present in the model (wrap it in <code>list()</code> to view them). For our linear regression model, we have one weight matrix and one bias vector.

In [33]:
# Parameters
list(model.parameters())  #model.parameters() returns a generator, so it is wrapped in list() for display

[Parameter containing:
 tensor([[ 0.3826, -0.4897,  0.4299],
         [ 0.5745, -0.3261,  0.3227]], requires_grad=True),
 Parameter containing:
 tensor([0.0843, 0.3305], requires_grad=True)]

In [34]:
# Model complexity: total number of elements across all parameters that require gradients
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

8


We can use the <code>model(tensor)</code> API to perform a forward pass that generates predictions.

In [35]:
# Generate predictions: calling the model on the inputs runs a forward pass
preds = model(inputs)
preds

tensor([[13.6913, 34.2938],
        [19.3222, 44.5626],
        [-7.3135, 25.3280],
        [33.9617, 56.8436],
        [ 9.5656, 31.2517],
        [13.6913, 34.2938],
        [19.3222, 44.5626],
        [-7.3135, 25.3280],
        [33.9617, 56.8436],
        [ 9.5656, 31.2517],
        [13.6913, 34.2938],
        [19.3222, 44.5626],
        [-7.3135, 25.3280],
        [33.9617, 56.8436],
        [ 9.5656, 31.2517]], grad_fn=<AddmmBackward>)

#### 4. Define loss function

The <code>nn</code> module contains many useful loss functions, such as these:

In [36]:
# Mean squared error loss, used for the regression in the following cells
criterion_mse = nn.MSELoss()
# Cross-entropy loss, shown as another available loss (not used in the cells visible here)
criterion_softmax_cross_entropy_loss = nn.CrossEntropyLoss()

In [37]:
# MSE between the predictions of the untrained model and the targets
mse = criterion_mse(preds, targets)
print(mse)
print(mse.item())  # .item() extracts the loss as a plain Python number

tensor(5456.9263, grad_fn=<MseLossBackward>)
5456.92626953125


#### 5. Define the optimizer

We use <code>optim.SGD</code> to perform stochastic gradient descent where samples are selected in batches (often with random shuffling) instead of as a single group.  Note that <code>model.parameters()</code> is passed as an argument to <code>optim.SGD</code>.

In [38]:
# Define optimizer
# Momentum also uses past gradients for each update, which can help to get out of local minima/maxima.
# With a momentum of 0.9, the update is the current gradient plus 0.9 times the gradient from
# one time step ago, 0.9^2 = 0.81 times the one from two time steps ago, etc.
opt = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9) 

#### 6. Training - putting everything together

In [39]:
def _model_device(model):
    """Return the device of the model's first parameter, or None if it exposes none."""
    params = getattr(model, "parameters", None)
    first = next(iter(params()), None) if callable(params) else None
    return None if first is None else first.device


# Utility function to train the model
def fit(num_epochs, model, loss_fn, opt, train_dl):
    """Train ``model`` with mini-batch gradient descent.

    Args:
        num_epochs: Number of full passes over ``train_dl``.
        model: Callable mapping an input batch to predictions.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar loss tensor.
        opt: Optimizer that updates the parameters of ``model``.
        train_dl: Iterable yielding ``(inputs, targets)`` batches.

    Every 10th epoch, the loss of the last processed batch is written to stdout.
    """
    # Batches are moved to the device the model's parameters live on, so that
    # inputs and weights are always on the same device (no reliance on a global).
    target_device = _model_device(model)

    # Repeat for given number of epochs
    for epoch in range(num_epochs):
        loss = None  # stays None when the loader yields no batches

        # Train with batches of data
        for xb, yb in train_dl:
            # Tensor.to() is not in-place: it returns the moved tensor,
            # so the result has to be rebound to the name.
            if target_device is not None:
                xb = xb.to(target_device)
                yb = yb.to(target_device)

            # 1. Predict
            pred = model(xb)

            # 2. Calculate loss
            loss = loss_fn(pred, yb)

            # 3. Calculate gradient
            opt.zero_grad()  # otherwise gradients accumulate across batches
            loss.backward()

            # 4. Update parameters using gradients
            opt.step()

        # Print the progress (loss of the last batch of this epoch)
        if loss is not None and (epoch + 1) % 10 == 0:
            sys.stdout.write("\rEpoch [{}/{}], Loss: {:.4f}".format(epoch+1, num_epochs, loss.item()))

In [40]:
# Train for 100 epochs; fit writes the loss of the last batch every 10th epoch
fit(100, model, criterion_mse, opt, train_dl)

Epoch [100/100], Loss: 19.90921

In [41]:
# Evaluate the trained model on the full training set.
# Gradients are not needed for evaluation, so torch.no_grad() skips building the autograd graph.
# This is the mean loss over all samples, whereas the training printout shows the loss
# of a single (last) batch, so the two numbers are not expected to match.
with torch.no_grad():
    preds = model(inputs)
    loss = criterion_mse(preds, targets)
print(loss.item())

48.98768997192383


### Practice

- Try to play around, change some neurons, and see what happens
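- Plot the model line using <code>model.linear.weight.item()</code> and <code>model.linear.bias.item()</code> (this applies to a single-input, single-output model that wraps its layer as <code>model.linear</code>; a bare <code>nn.Linear</code> like the one above exposes <code>model.weight</code> and <code>model.bias</code> directly, and <code>.item()</code> only works on one-element tensors)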
- Try to load the boston dataset and learn to map the data to tensorDataset
- Try to plot the loss over time