# Parameter tuning

**IMPORTANT: Please zip up and submit your TensorBoard log files with your homework. That will help me to see what you were looking at as you went through your tuning process.**


In [75]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
torchvision.disable_beta_transforms_warning()
import torchvision.transforms.v2 as transforms


import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineRenderer.figure_format = 'retina'


import torch.utils.tensorboard as tb
import datetime
import os
from tqdm.notebook import tqdm
import time 


### Setting up the data and tensorboard

Make directories to save the models and logs in.

In [76]:
if not os.path.exists("logs"):
    os.mkdir("logs")

if not os.path.exists("models"):
    os.mkdir("models")


In [78]:
# Setting the transform so that the images are put in the right format
transform = transforms.Compose([
    transforms.ToImage(),
    transforms.ConvertImageDtype(),
])

# Load data
cifar = torchvision.datasets.CIFAR10("../../data/torch/cifar", download=True, transform=transform)

# Set the training size to 80% of the total data
train_size = int(0.8 * len(cifar))

# Split data into training and validation sets
train_data, valid_data = torch.utils.data.random_split(cifar, [train_size, len(cifar) - train_size])

# These are the classes within the dataset
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']


Files already downloaded and verified


Get the necessary constants and define normalization for this dataset

In [79]:
cifar_mean = (0.4914, 0.4822, 0.4465)
cifar_std = (0.2470, 0.2435, 0.2616)

normalize = transforms.Normalize(cifar_mean, cifar_std)

### setting up the CNN


I want to set up my CNN such that I can change the convolutions in each run, partially because that will make a difference and partially because i don't fully understand the math. I will always end with flattening and a linear layer, but I will need to change the parameters for `nn.Linear` depending on the previous convolutions.

In [81]:
# Code to test if the layers are actually going to work


# given 3x32x32
def valid_arch(arch, input_size=(3,32,32)):

    # (Cin, H, W) = input_size
    # (Cout, _, Kh, Kw) = kernel.shape
    # output = np.zeros((Cout, H - Kh + 1, W - Kw + 1))
    # # next layer should be output size?
    
    '''
    assert kernel shape matches (does the kernel have to fit nicely into the image?)
    
    prev_size = input_size

    for c in conv_list:
        
        (Cout, _, Kh, Kw) = kernel.shape
        next_layer = next in c
        assert(next_layer.shape() = (Cout, H - Kh + 1, W - Kw + 1))
        



                
        # have some checker function to make sure the math is right 


        # have learning rate that works roughly ok, the pick arch, then tune learning rate and other parameters
        # five to eight total



        # sizing output should be input, in  terms of num of channels, 
        # add padding, then subtract kernel size -1, then divide by the stride size, then make sure that the output matches input
        # can use nn.adaptiveave pool.2d, and give it 1x1 output and it'll figure it out for you
        

    
    '''
    return True

In [82]:
class CNN(nn.Module):

    def __init__(self, arch):
        super().__init__()
        assert(valid_arch(arch))
        # Dynamically create attributes for each convolution
        self.arch = arch
        self.depth = len(arch)

        # for i in range(len(layers)):
        #     setattr(CNN, 'l'+str(i), layers[i])


        # self.flatten = nn.Flatten()

        # Fix linear inputs so it's dynamic
        # if you do average pooling then after flattening the number of inputs you just put number of channels into the linear layer
        # self.linear =  nn.Linear(32*8*8, 10)

    def forward(self, x):
        
        for i in range(self.depth):
            current = self.arch[i]
            x = current(x)

        return x

### Training

general strategy: find a good learning rate, then find good arch, then tune learning rate and other params


include scheduling, but only using one. will find a good learning rate and see if i can tune it by changing the patience



In [91]:
def train(run, 
          arch=0,
          lr=1e-3,
          epochs=10,
          batch_size=64,
          reg=1e-5,
          device = 'mps', factor=0.1, patience=3):
    
    print("Learning rate:", lr)
    print("Factor:", factor)
    print("Patience:", patience)
    
    start = time.time()
    data_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, shuffle=False)

    augments = transforms.Compose([
            transforms.RandomHorizontalFlip(0.5),
            transforms.RandomGrayscale(0.1),
            transforms.ColorJitter(
                brightness=(0.9,1.1),
                contrast=0,
                saturation=0,
                hue=0),
            transforms.RandomCrop(
                size=32,
                padding=2,
                fill=cifar_mean)
        ])
    
    print(arch)
    print(archs[arch])
    model = CNN(archs[arch]).to(device)

    loss = nn.CrossEntropyLoss()
    opt = optim.SGD(model.parameters(), momentum=0.9, lr=lr, weight_decay=reg)
    
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='max', factor=factor, patience=patience)

    # Naming the training runs
    name = str(run) + '--' # Run number
    # name += ":".join(map(str, convolutions))
    name += 'arch' + str(arch)
    name += '-lr-' + str(lr) + '-bs-' + str(batch_size) + '-epochs-' + str(epochs) + '-reg-' + str(reg)
    name += '-pat-' + str(patience) + '-fac-' + str(factor)
    logger = tb.SummaryWriter(os.path.join("logs/", name))
    global_step = 0


    for i in tqdm(range(epochs)):

        # Train

        model.train()
        for batch_xs, batch_ys in data_loader:
            batch_xs = augments(batch_xs).to(device)
            batch_ys = batch_ys.to(device)

            preds = model(normalize(batch_xs))
            loss_val = loss(preds, batch_ys)

            logger.add_scalar('loss', loss_val, global_step=global_step)

            opt.zero_grad()
            loss_val.backward()
            opt.step()

            logger.add_scalar("Training accuracy", (preds.argmax(dim=1) == batch_ys).float().mean(), global_step=global_step)

            global_step += 1
        
        # Evaluate
        model.eval()
        valid_accs = []
        for batch_xs, batch_ys in valid_loader:
            batch_xs = batch_xs.to(device)
            batch_ys = batch_ys.to(device)
            valid_preds = model(normalize(batch_xs))
            valid_accs.append((valid_preds.argmax(dim=1) == batch_ys).float().mean())
        valid_accuracy = torch.tensor(valid_accs).mean()
        logger.add_scalar("Validation accuracy", valid_accuracy, global_step=global_step)
        scheduler.step(valid_accuracy)
        print("Acc:", valid_accuracy)
        

    logger.add_scalar("training time", time.time() - start, global_step=global_step)
    return model


## Training and tuning


will be a loop

First, we want to make sure that we don this on the GPU:

In [84]:
if torch.backends.mps.is_available():
    device = torch.device('mps')

else:
    decive = 'cpu'

### Architectures and learning rates

Trying some different learning rates


In [89]:
# set up first training loop

# first try some different lr

archs = [torch.nn.ModuleList([
    nn.Conv2d(3, 8, 3, padding=3), nn.ReLU(), nn.Conv2d(8, 16, 3, groups=2, padding=1, stride=2), nn.ReLU(), nn.Conv2d(16, 32, 3, stride=2), nn.Flatten(), nn.Linear(32*8*8, 10)]),
    torch.nn.ModuleList([
        nn.Conv2d(3, 8, (7, 7), stride=2, padding=3), nn.ReLU(), nn.Conv2d(8, 16, 3, groups=2, padding=1), nn.MaxPool2d(2, stride=2), nn.Conv2d(16, 32, 3), nn.AvgPool2d(kernel_size=(6, 6)),nn.Flatten(),nn.Linear(32, 10)
    
    ]),


]

In [90]:
len(archs)

2

In [86]:
learning_rates = [1e-2, 1e-1]

for i in range(len(learning_rates)):
    cnn_model = train(i, arch=0, lr=learning_rates[i])
    torch.save(cnn_model.state_dict(), os.path.join("models/", "CNN" + str(i) + '.pt'))

Learning rate: 0.01
Factor: 0.1
Patience: 3


  0%|          | 0/10 [00:00<?, ?it/s]

Acc: tensor(0.4553)
Acc: tensor(0.5239)


KeyboardInterrupt: 

results: it seems like in general 1e-2 had the best accuracy. after that, the accuracy plumeted about 30 percent. now i am going to tune the scheduler

### Tuning the scheduler

In [87]:
patience = [5,4,3,2,1]

for i in range(len(patience)):
    cnn_model = train(i, arch=0, lr=1e-2, patience=patience[i])

Learning rate: 0.01
Factor: 0.1
Patience: 5


  0%|          | 0/10 [00:00<?, ?it/s]

Acc: tensor(0.5704)
Acc: tensor(0.5930)
Acc: tensor(0.6009)
Acc: tensor(0.6138)
Acc: tensor(0.6088)
Acc: tensor(0.6030)
Acc: tensor(0.6145)
Acc: tensor(0.6186)
Acc: tensor(0.6153)
Acc: tensor(0.6325)
Learning rate: 0.01
Factor: 0.1
Patience: 4


  0%|          | 0/10 [00:00<?, ?it/s]

Acc: tensor(0.6276)
Acc: tensor(0.6297)
Acc: tensor(0.6338)
Acc: tensor(0.6398)
Acc: tensor(0.6166)
Acc: tensor(0.6105)
Acc: tensor(0.6437)
Acc: tensor(0.6303)
Acc: tensor(0.6359)
Acc: tensor(0.6238)
Learning rate: 0.01
Factor: 0.1
Patience: 3


  0%|          | 0/10 [00:00<?, ?it/s]

Acc: tensor(0.6299)
Acc: tensor(0.6304)
Acc: tensor(0.6283)
Acc: tensor(0.6431)
Acc: tensor(0.6224)
Acc: tensor(0.6365)
Acc: tensor(0.6352)
Acc: tensor(0.6240)
Acc: tensor(0.6603)
Acc: tensor(0.6682)
Learning rate: 0.01
Factor: 0.1
Patience: 2


  0%|          | 0/10 [00:00<?, ?it/s]

Acc: tensor(0.6323)
Acc: tensor(0.6399)
Acc: tensor(0.6330)
Acc: tensor(0.6334)
Acc: tensor(0.6377)
Acc: tensor(0.6640)
Acc: tensor(0.6688)
Acc: tensor(0.6700)
Acc: tensor(0.6711)
Acc: tensor(0.6719)
Learning rate: 0.01
Factor: 0.1
Patience: 1


  0%|          | 0/10 [00:00<?, ?it/s]

Acc: tensor(0.6320)
Acc: tensor(0.6197)
Acc: tensor(0.6464)
Acc: tensor(0.6307)
Acc: tensor(0.6470)
Acc: tensor(0.6300)
Acc: tensor(0.6282)
Acc: tensor(0.6668)
Acc: tensor(0.6721)
Acc: tensor(0.6724)


patience = 1 was best, but not by that much

now doing the architecture

In [92]:
cnn_model = train(i, arch=0, lr=1e-2, patience=1)


Learning rate: 0.01
Factor: 0.1
Patience: 1
0
ModuleList(
  (0): Conv2d(3, 8, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3))
  (1): ReLU()
  (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=2)
  (3): ReLU()
  (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2))
  (5): Flatten(start_dim=1, end_dim=-1)
  (6): Linear(in_features=2048, out_features=10, bias=True)
)


  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [93]:

cnn_model = train(i, arch=1, lr=1e-2, patience=1)

Learning rate: 0.01
Factor: 0.1
Patience: 1
1
ModuleList(
  (0): Conv2d(3, 8, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (1): ReLU()
  (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2)
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (5): AvgPool2d(kernel_size=(6, 6), stride=(6, 6), padding=0)
  (6): Flatten(start_dim=1, end_dim=-1)
  (7): Linear(in_features=32, out_features=10, bias=True)
)


  0%|          | 0/10 [00:00<?, ?it/s]

Acc: tensor(0.3739)
Acc: tensor(0.4291)
Acc: tensor(0.4200)
Acc: tensor(0.4708)
Acc: tensor(0.4492)
Acc: tensor(0.4811)
Acc: tensor(0.4868)
Acc: tensor(0.5079)
Acc: tensor(0.4984)
Acc: tensor(0.5119)


results from arch
arch 0 was better
arch 1 not as good

## Conclusion

In [None]:
# visualize for fun