# Improving models

In [3]:
import pandas as pd
import numpy as np

# Read the animals table from the CSV file in the working directory
animals = pd.read_csv('animals.csv')
# Preview the first rows of the table
animals.head()

Unnamed: 0,name,hair,feathers,eggs,milk,predator,fins,legs,tail,type
0,skimmer,0,1,1,0,1,0,2,1,2
1,gull,0,1,1,0,1,0,2,1,2
2,seahorse,0,0,1,0,0,1,0,1,4
3,tuatara,0,0,1,0,1,0,4,1,3
4,squirrel,1,0,0,1,0,0,2,1,1


In [15]:
# Feature matrix: every column between the first (name) and the last (type)
feature_columns = animals.columns[1:-1]
features = animals[feature_columns]
# Convert the selected columns to a NumPy array, one row per animal
X = features.to_numpy()
X

array([[0, 1, 1, 0, 1, 0, 2, 1],
       [0, 1, 1, 0, 1, 0, 2, 1],
       [0, 0, 1, 0, 0, 1, 0, 1],
       [0, 0, 1, 0, 1, 0, 4, 1],
       [1, 0, 0, 1, 0, 0, 2, 1]])

In [16]:
# Ground-truth labels: the last column of the table
label_column = animals.columns[-1]
target = animals[label_column]
# Same labels as a NumPy vector
y = target.to_numpy()
y

array([2, 2, 4, 3, 1])

In [17]:
import torch
from torch.utils.data import TensorDataset

# Pair the feature matrix with the labels in a TensorDataset (both stored as float32)
inputs_tensor = torch.tensor(X, dtype=torch.float32)
labels_tensor = torch.tensor(y, dtype=torch.float32)
dataset = TensorDataset(inputs_tensor, labels_tensor)

In [19]:
# Indexing the dataset yields one (input, label) pair
sample = dataset[0]
input_sample = sample[0]
label_sample = sample[1]
print("input: %s, label: %s" % (input_sample, label_sample))

input: tensor([0., 1., 1., 0., 1., 0., 2., 1.]), label: tensor(2.)


In [21]:
from torch.utils.data import DataLoader

# Batching configuration
batch_size = 2  # number of samples yielded per iteration
shuffle = True  # passed to the DataLoader so samples are drawn in shuffled order

# Wrap the dataset in a DataLoader and print every batch it yields
dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)
for batch in dataloader:
    batch_inputs, batch_labels = batch
    print("inputs: %s, labels: %s" % (batch_inputs, batch_labels))
    

inputs: tensor([[0., 1., 1., 0., 1., 0., 2., 1.],
        [0., 0., 1., 0., 1., 0., 4., 1.]]), labels: tensor([2., 3.])
inputs: tensor([[1., 0., 0., 1., 0., 0., 2., 1.],
        [0., 0., 1., 0., 0., 1., 0., 1.]]), labels: tensor([1., 4.])
inputs: tensor([[0., 1., 1., 0., 1., 0., 2., 1.]]), labels: tensor([2.])


## Water potability example

In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

# Read the water potability table from the CSV file in the working directory
potability = pd.read_csv('water_potability.csv')
# Preview the first rows of the table
potability.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,0.587349,0.577747,0.386298,0.568199,0.647347,0.292985,0.654522,0.795029,0.630115,0
1,0.643654,0.4413,0.314381,0.439304,0.514545,0.356685,0.377248,0.202914,0.520358,0
2,0.388934,0.470876,0.506122,0.524364,0.561537,0.142913,0.249922,0.401487,0.219973,0
3,0.72582,0.715942,0.506141,0.521683,0.751819,0.148683,0.4672,0.658678,0.242428,0
4,0.610517,0.532588,0.237701,0.270288,0.495155,0.494792,0.409721,0.469762,0.585049,0


In [9]:
# Build float32 tensors: four selected input columns and the Potability label column
feature_columns = ['ph', 'Sulfate', 'Conductivity', 'Organic_carbon']
features = torch.tensor(potability[feature_columns].to_numpy(), dtype=torch.float32)
target = torch.tensor(potability['Potability'].to_numpy(), dtype=torch.float32)

# Pair inputs and labels in a single dataset
dataset = TensorDataset(features, target)

# Serve the dataset in shuffled batches of two and pull the first batch
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
x, y = next(iter(dataloader))

# Two stacked linear layers: 4 inputs -> 2 hidden units -> 1 output
model = nn.Sequential(
    nn.Linear(4, 2),
    nn.Linear(2, 1),
)
# Run the whole feature tensor through the (untrained) model
output = model(features)
print(output)

tensor([[0.5964],
        [0.6105],
        [0.5581],
        ...,
        [0.6514],
        [0.5759],
        [0.5717]], grad_fn=<AddmmBackward0>)


## Calculate training loss
Split the data into train, validate and test datasets. Create a DataLoader for each.

In [None]:
# the training loop
# Relies on num_epochs, trainloader, model, criterion and optimizer being
# defined in earlier cells.
for epoch in range(num_epochs):
    training_loss = 0.0
    # iterate over the same loader whose length is used for the average below
    for feature, target in trainloader:
        # set gradients to 0 so they do not accumulate across batches
        optimizer.zero_grad()
        # forward pass
        pred = model(feature)
        # compute loss and gradients
        loss = criterion(pred, target)
        loss.backward()
        # update model params
        optimizer.step()
        # sum the per-batch loss values
        training_loss += loss.item()
    # average batch loss for this epoch; len(trainloader) = number of batches
    epoch_loss = training_loss / len(trainloader)

## Calculate validation loss
Run this after each training epoch, with the model in evaluation mode and gradient tracking disabled.

In [None]:
# Validation pass: only measures the loss, never updates the model.
# Relies on validationloader, model and criterion being defined in earlier cells.
validation_loss = 0.0
model.eval() # switch to evaluation mode
with torch.no_grad(): # no gradients are tracked here, so no backward()/optimizer.step()
    for feature, target in validationloader:
        # forward pass
        pred = model(feature)
        # compute loss only
        loss = criterion(pred, target)
        # sum the per-batch loss values
        validation_loss += loss.item()
# average batch loss; len(validationloader) = number of batches
epoch_loss = validation_loss / len(validationloader)
model.train() # switch back to training before running the next training epoch

## Other metrics

In [2]:
import torchmetrics

# create accuracy metric using torchmetrics
metric = torchmetrics.Accuracy(task="multiclass", num_classes=3) # performing classification over 3 classes
for feature, target in dataloader:
    # forward pass
    pred = model(feature)
    # calc accuracy over the batch; the metric object also accumulates it internally
    # NOTE(review): argmax assumes target is one-hot encoded - confirm against the dataset
    acc = metric(pred, target.argmax(dim=-1))
# calc accuracy over the whole epoch
acc = metric.compute()
print("acc on all data %f" % acc)
# reset the metric for the next epoch
metric.reset()
    


## Overfitting
* Reduce model size or add dropout layer
* Force the parameters to remain small by applying weight decay
* Augment data or get more

### Dropout

In [1]:
# example dropout
# Requires `torch` and `torch.nn as nn` to be imported (see the import cell above).
model = nn.Sequential(
    nn.Linear(8, 4),
    nn.ReLU(),
    nn.Dropout(p=0.5)  # zeroes each activation with probability 0.5 while in training mode
)
# a single random sample with 8 input features
features = torch.randn((1, 8))
# feed the sample (not a loop index) through the model
model(features)

NameError: name 'nn' is not defined

Dropout is usually added after the activation function. Because dropout behaves differently during training and evaluation, the model has to be switched between `model.train()` and `model.eval()` accordingly.

### Weight decay
The weight decay value should be between 0 and 1 and is typically small, e.g. 1e-3 or 1e-4.

In [None]:
# example weight decay
# Uses the fully qualified torch.optim because no `optim` alias is imported in this notebook.
# weight_decay adds an L2 penalty on the parameters to each update.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-4)

## Improve performance

* Create a model (large enough) that can overfit the training set
* Reduce overfitting
* Tune hyperparams

In [2]:
# start by overfitting a single batch (a single data point when batch_size is 1)
# Relies on trainloader, model, criterion and optimizer being defined in earlier cells.
features, labels = next(iter(trainloader))
for i in range(1000):  # range() needs an int; 1e3 is a float
    # clear gradients from the previous iteration so they do not accumulate
    optimizer.zero_grad()
    outputs = model(features)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

NameError: name 'trainloader' is not defined

The accuracy should be close to one and the loss close to 0.
Then overfit the whole training set. Hyperparams should have their default values.
Plot training and validation accuracy.

After that, maximize validation accuracy by applying dropout layers, data augmentation, or weight decay, or by reducing model capacity.

After that, tune hyperparams, using grid search or random search

In [None]:
# example grid search
# Sweep the learning rate over negative powers of ten: exponents 2, 3, 4 and 5
exponents = range(2, 6)
for factor in exponents:
    lr = 10 ** -factor

In [None]:
# example random search
# Collect ten (learning rate, momentum) candidates drawn at random
values = []
for idx in range(10):
    # Draw the learning-rate exponent uniformly between 2 and 4 ...
    factor = np.random.uniform(low=2, high=4)
    # ... and the momentum uniformly between 0.85 and 0.99
    momentum = np.random.uniform(low=0.85, high=0.99)

    # The learning rate is ten raised to the negated exponent
    lr = 10 ** -factor

    candidate = (lr, momentum)
    values.append(candidate)