# Introduction to PyTorch

A short introduction to training models with PyTorch.

Check out the links at the bottom of the notebook for further information.

## What is PyTorch?

- Numpy on GPU
- Library for neural network components
- Automatic differentiation engine
- Library for optimization methods

### Numpy

In [2]:
import torch
import numpy as np

# Small 2x2 nested list used below to build both a NumPy array and a torch tensor.
data = [[1, 2], [3, 4]]

In [6]:
# Build a NumPy array from the nested list and show its contents and Python type.
np_x = np.array(data)
print(np_x, type(np_x))

[[1 2]
 [3 4]] <class 'numpy.ndarray'>


In [7]:
# The same data as a torch tensor; print it together with its element dtype.
x = torch.tensor(data)
print(x, x.dtype)

tensor([[1, 2],
        [3, 4]]) torch.int64


Torch tensors and NumPy arrays can be converted directly into one another:

In [8]:
# Wrap the existing NumPy array as a torch tensor (torch.from_numpy does not copy).
torch_x = torch.from_numpy(np_x)
print(torch_x)

tensor([[1, 2],
        [3, 4]])


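A tensor created with `torch.from_numpy` shares memory with the source array, so a change made through one is visible in the other: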

In [None]:
# Write through the tensor, then display both objects: torch_x was created with
# torch.from_numpy(np_x), so the change is expected to show up in np_x as well.
torch_x[0, 0] = 100
torch_x, np_x

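Indexing a single element returns different types: a 0-dimensional tensor in torch and a NumPy scalar in NumPy. Use `.item()` to get a plain Python number from the tensor: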

In [None]:
# Rebuild np_x from the original list so it holds the unmodified values again.
np_x = np.array(data)
# Index one element of the tensor and of the array and print both results.
print(x[0, 0], np_x[0, 0])

# .item() turns the single-element tensor into a plain Python number.
print(x[0, 0].item())

Operations work just like numpy:

In [None]:
# Matrix product of x with its transpose, first with the torch tensor ...
print(x @ x.T)

# ... and the same expression with the NumPy array.
print(np_x @ np_x.T)

Need the NumPy array back?

In [None]:
# Convert the tensor back to a NumPy array.
x.numpy()

In [None]:
%%time
# NOTE: `device` is assigned here but not used by the NumPy loop below.
device = torch.device("cpu")
#device = torch.device("cuda:1")

# Side length of the square array of ones.
l = 100
a = np.ones((l, l))

# Double the array in place 1000 times; %%time reports how long the cell takes.
for _ in range(1000):
    a += a

# Display the resulting array as the cell output.
a

### Training a model

Each training iteration consists of 5 steps:
1. forward
2. compute loss
3. zero gradients
4. compute new gradients (backwards)
5. optimization step

In [3]:
import matplotlib.pyplot as plt

In [None]:
# Synthetic two-class problem: points drawn from a 2-D standard normal,
# labelled 1 when they lie within distance 1.2 of the origin and 0 otherwise.
N, dims, n_classes = 1000, 2, 2
x = torch.randn(N, dims)
y = (torch.linalg.norm(x, ord=2, dim=1) <= 1.2).long()

# One scatter call per class so that each class gets its own colour.
plt.figure(figsize=(7, 7))
for label, colour in ((0, "blue"), (1, "orange")):
    members = x[y == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)

# Number of points in class 1 (displayed as the cell output).
(y == 1).sum()

In [None]:
# define a simple model
hidden_dim = 1000
input_dim = dims

# The model has to exist before the optimizer is built from its parameters:
# one hidden layer with a ReLU, producing one score per class.
model = torch.nn.Sequential(
    torch.nn.Linear(input_dim, hidden_dim),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_dim, n_classes)
)

# define a loss function (takes raw class scores and integer class labels)
criterion = torch.nn.CrossEntropyLoss()

# define an optimizer
learning_rate = 0.001
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.0001
)

# Full-batch training: every epoch uses the whole dataset in one forward pass.
num_epochs = 3000
for i in range(num_epochs):

    # Step 1: forward pass
    y_pred = model(x)

    # Step 2: compute loss
    loss = criterion(y_pred, y)

    # (Compute accuracy) -- the predicted class is the one with the highest score
    if i % 100 == 0:
        acc = (y_pred.argmax(dim=1) == y).sum() / len(y)
        print(f"Epoch {i}, {acc.item()=:.3f}")

    # Step 3: Zero the gradients (they would otherwise accumulate across steps)
    optimizer.zero_grad()

    # Step 4: Backward pass to compute gradients
    loss.backward()

    # Step 5: Update params
    optimizer.step()

In [None]:
# Compare the model's predictions with the true labels, side by side.
with torch.no_grad():  # inference only, so no gradient tracking is needed
    y_pred = model(x).argmax(axis=1)

fig, axes = plt.subplots(ncols=2, figsize=(12, 6))

# Left panel uses the true labels, right panel the predicted ones;
# both panels use the same colour per class.
for ax, labels in zip(axes, (y, y_pred)):
    for label, colour in ((0, "blue"), (1, "orange")):
        mask = labels == label
        ax.scatter(x[mask, 0], x[mask, 1], color=colour)

axes[0].set_title("True classes")
axes[1].set_title("Predicted classes")

### What about GPU?

In [21]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [None]:
# Move the full dataset to the selected device
x, y = x.to(device), y.to(device)  # <-------------------------------

# Model: dims -> hidden_dim -> n_classes, with a ReLU in between
input_dim = dims
hidden_dim = 100
model = torch.nn.Sequential(
    torch.nn.Linear(input_dim, hidden_dim),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_dim, n_classes),
).to(device)  # <------------------------------- parameters go to the same device

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.0001)

num_epochs = 3000
for epoch in range(num_epochs):
    # Step 1: forward pass
    y_pred = model(x)

    # Step 2: compute loss
    loss = criterion(y_pred, y)

    # Report the accuracy every 100 epochs
    if epoch % 100 == 0:
        n_correct = (y_pred.argmax(dim=1) == y).sum()
        acc = n_correct / len(y)
        print(f"Epoch {epoch}, {acc.item()=:.3f}")

    # Steps 3-5: clear old gradients, backpropagate, update the parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

### Custom datasets
https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataset

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Skeleton of a custom dataset: the three methods below are placeholders."""

    def __init__(self, *args):
        """Placeholder for loading or receiving the data."""
        # load data
        pass
    
    def __len__(self):
        """Placeholder; meant to return the number of samples."""
        pass
    
    def __getitem__(self, idx):
        """Placeholder; meant to return the (data, target) pair at ``idx``."""
        pass  # data, target

In [None]:
class CircleDataset(torch.utils.data.Dataset):
    """Dataset pairing each entry of ``x`` with the label at the same index in ``y``.

    Args:
        x: Indexable collection of features, one entry per sample.
        y: Indexable collection of labels, aligned with ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, x, y):
        # Fail early: a mismatch would otherwise only surface as an IndexError
        # (or as silently ignored samples) while iterating.
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.x[idx], self.y[idx]

In [None]:
# Define the dataset and dataloader
dataset = CircleDataset(x, y)
loader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True)
dataset[0]

In [None]:
# Draw a single batch from the loader; the trailing comment gives the expected shapes.
next(iter(loader))  # (10, 2),  (10)

In [30]:
# define a simple model
hidden_dim = 100
input_dim = dims

model = torch.nn.Sequential(
    torch.nn.Linear(input_dim, hidden_dim),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_dim, n_classes)
)
model = model.to(device)  # <-------------------------------

# define a loss function
criterion = torch.nn.CrossEntropyLoss()

# define an optimizer
learning_rate = 0.001
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.0001
)

# Mini-batch training: one optimization step per batch drawn from `loader`.
num_epochs = 100
for i in range(num_epochs):
    # Running count of correct predictions over this epoch's batches.
    correct = 0
    # Dedicated batch names keep the full-dataset tensors `x` and `y` intact;
    # reusing those names would leave them bound to the last mini-batch.
    for x_batch, y_batch in loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        # Step 1: forward pass
        y_pred = model(x_batch)

        # Step 2: compute loss
        loss = criterion(y_pred, y_batch)

        # Step 3: Zero the gradients
        optimizer.zero_grad()

        # Step 4: Backward pass to compute gradients
        loss.backward()

        # Step 5: Update params
        optimizer.step()

        # Predictions were made before this step's update, so the epoch
        # accuracy is a running training accuracy.
        correct += (y_pred.argmax(dim=1) == y_batch).sum()
    # (Compute accuracy)
    if i % 5 == 0:
        acc = correct / len(loader.dataset)
        print(f"Epoch {i}, {acc.item()=:.3f}")

NameError: name 'device' is not defined

### A more realistic example: MNIST

In [None]:
import torchvision.transforms as transforms
from torchvision.datasets import MNIST

In [None]:
# Convert every image to a tensor when it is read from the dataset.
transform = transforms.ToTensor()

# Build the training and the test split, both stored under ./mnist_data
# (download=True is passed for both).
train_dataset, test_dataset = (
    MNIST("./mnist_data", train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)
print(len(train_dataset), len(test_dataset))

In [None]:
# Show the first training sample; [0][0] picks the image out of the sample
# (presumably an (image, label) pair) and reshape drops it to 28x28 for plotting.
plt.imshow(train_dataset[0][0].reshape(28, 28))

In [None]:
# The dataset is large, so it is processed in mini-batches.
# Training data is shuffled; the test data keeps its original order.
batch_size = 16
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch.nn.functional as F


# define more complex models as classes
class SimpleNet(torch.nn.Module):
    """Fully connected classifier for flattened images.

    Three linear layers with a ReLU after each of the first two; the output
    is one unnormalised score per class.

    Args:
        image_size: ``(height, width)`` of the input images.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output classes.
    """

    def __init__(self, image_size=(28, 28), hidden_dim=100, output_dim=10):
        super().__init__()
        self.input_dim = image_size[0] * image_size[1]
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        self.lin1 = torch.nn.Linear(self.input_dim, self.hidden_dim)
        self.lin2 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.lin3 = torch.nn.Linear(self.hidden_dim, self.output_dim)

    def forward(self, x):
        """Return class scores of shape ``(batch, output_dim)`` for ``x``.

        ``x`` may have any shape whose elements regroup into rows of
        ``input_dim`` values, e.g. ``(batch, 1, height, width)``.
        """
        # Flatten every image into a single vector. reshape (unlike view)
        # also accepts non-contiguous inputs such as transposed images.
        x = x.reshape(-1, self.input_dim)
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        return self.lin3(x)

In [None]:
learning_rate = 1e-3

model = SimpleNet(hidden_dim=100).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# The test-set size does not change, so look it up once.
n_test = len(test_loader.dataset)

num_epochs = 100
for i in range(num_epochs):
    # training
    model.train()
    # Dedicated batch names avoid overwriting any global `x` / `y` tensors.
    for x_batch, y_batch in train_loader:  # x_batch.shape = (batchsize, 1, 28, 28)
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        y_pred = model(x_batch)
        loss = criterion(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # testing: count correct predictions over the whole test set
    correct = 0
    model.eval()
    with torch.no_grad():  # evaluation only, no gradients needed
        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            pred = model(x_batch).argmax(dim=1)
            correct += (pred == y_batch).sum().item()

    print(f"{correct}/{n_test} "
          f"({correct / n_test:.3f})")
    

## There is more:

* Model checkpointing/restoring models
* Using pretrained models
* Tuning the optimization procedure
* (Tracking experiments)

General tutorials: 
* https://pytorch.org/tutorials/

Avoiding boilerplate code:
* https://skorch.readthedocs.io/en/latest/?badge=latest
* https://www.pytorchlightning.ai/

Pretrained models:
* https://pytorch.org/vision/stable/models.html (Vision)
* https://huggingface.co/models (NLP)

Experiment tracking (very useful, but skip this for now):
* guild.ai
* wandb.ai
* dvc.org
* maaaany more...
