<a href="https://colab.research.google.com/github/Devininthelab/Machine-Learning-Collection/blob/master/Machine-Learning-Collection/ML/Pytorch/Basics/13_Pytorch_common_mistakes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the course repository into the current working directory (Colab setup step).
!git clone https://github.com/Devininthelab/Machine-Learning-Collection.git

Cloning into 'Machine-Learning-Collection'...
remote: Enumerating objects: 1270, done.[K
remote: Counting objects: 100% (315/315), done.[K
remote: Compressing objects: 100% (179/179), done.[K
remote: Total 1270 (delta 165), reused 136 (delta 136), pack-reused 955[K
Receiving objects: 100% (1270/1270), 97.31 MiB | 18.54 MiB/s, done.
Resolving deltas: 100% (526/526), done.


# 1. Overfit a batch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms



In [None]:
class NN(nn.Module):
    """Fully connected classifier: input -> 50 hidden units (ReLU) -> class scores."""

    def __init__(self, input_size, num_classes):
        """Create the two linear layers.

        Args:
            input_size: number of features of each (flattened) input sample.
            num_classes: number of output scores produced per sample.
        """
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)

    def forward(self, x):
        """Return the raw output of the last linear layer (no softmax applied)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)


# Quick instantiation with 784 input features and 10 classes; a later cell
# rebuilds `model` from the hyperparameters and moves it to the device.
model = NN(784, 10)


In [None]:
# Hyperparameters
input_size = 784
num_classes = 10
lr = 0.001
# Set batch_size to 1 so that a single batch can be overfit
batch_size = 1
num_epochs = 5

In [None]:
# Load data
train_dataset = datasets.MNIST(root='./dataset', train=True, transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.MNIST(root='./dataset', train=False, transform=transforms.ToTensor(), download=True)
test_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the classifier from the hyperparameters and place it on the chosen device.
model = NN(input_size=input_size, num_classes=num_classes)
model = model.to(device)

# Cross-entropy on the raw model outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

Nếu model overfit được 1 batch (loss giảm dần về gần 0) thì chứng tỏ model đủ capacity để học và pipeline training không có bug. Sau đó mới tăng batch_size lên để train trên toàn bộ dữ liệu.


In [None]:
# Take one batch from the training loader; the loop in the next cell reuses this same batch every epoch.
data, targets = next(iter(train_loader))

In [None]:
# Sanity check: train repeatedly on the single batch fetched earlier and watch the loss.
for epoch in range(num_epochs):
    print(f'Epoch [{epoch+1} / {num_epochs}]')

    # Move the batch to the training device and flatten every sample to a vector.
    data, targets = data.to(device), targets.to(device)
    data = data.reshape(data.size(0), -1)

    # Clear old gradients before computing new ones for this step.
    optimizer.zero_grad()

    # Forward pass and loss.
    scores = model(data)
    loss = criterion(scores, targets)
    print(loss)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

Epoch [1 / 5]
tensor(0.0170, grad_fn=<NllLossBackward0>)
Epoch [2 / 5]
tensor(0.0147, grad_fn=<NllLossBackward0>)
Epoch [3 / 5]
tensor(0.0122, grad_fn=<NllLossBackward0>)
Epoch [4 / 5]
tensor(0.0099, grad_fn=<NllLossBackward0>)
Epoch [5 / 5]
tensor(0.0078, grad_fn=<NllLossBackward0>)


-> Loss decreasing -> đủ capability để học

# 2. Forgot to toggle train / eval

Khi inference/evaluation cần gọi model.eval(), sau đó gọi lại model.train() trước khi train tiếp.
-> Lý do: ở chế độ eval, dropout bị tắt và batch norm dùng running statistics (thay vì thống kê của batch hiện tại). Nếu quên toggle thì kết quả evaluation sẽ bị sai lệch.

In [None]:
# Hyperparameters for the full training runs
input_size = 784
num_classes = 10
lr = 0.001
# Use a regular mini-batch size now (batch_size = 1 was only for the overfit check)
batch_size = 16
num_epochs = 3

# Rebuild the loaders: a DataLoader fixes its batch size at construction time, so
# without this the loops that follow would still iterate with the old batch size.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation uses the held-out test split; shuffling is not needed there.
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Restore the model weights and optimizer state from a saved checkpoint.
# map_location=device lets a checkpoint saved on one device type load on another
# (e.g. saved on GPU, loaded on a CPU-only runtime).
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted
# at /content/drive before this cell runs; confirm.
checkpoint = torch.load('/content/drive/MyDrive/Self study/Aladin Pearson/mycheckpoint.pth.tar', map_location=device)
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])


In [None]:
# Full training pass over train_loader, reporting the mean batch loss of each epoch.
for epoch in range(num_epochs):
    print(f'Epoch [{epoch+1} / {num_epochs}]')
    losses = []  # per-batch loss values collected during this epoch
    for data, targets in train_loader:
        # Move the batch to the training device and flatten every sample to a vector.
        data, targets = data.to(device), targets.to(device)
        data = data.reshape(data.size(0), -1)

        # Reset gradients first so this step only uses the current batch's gradients.
        optimizer.zero_grad()

        # Forward pass and loss.
        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
    print(f'Loss at epoch {epoch + 1} is {sum(losses)/len(losses):.2f}')


Epoch [1 / 3]


KeyboardInterrupt: 

In [None]:
def check_accuracy(model, loader):
    """Print how many samples from ``loader`` the model classifies correctly.

    Args:
        model: module mapping a batch of flattened samples to per-class scores.
        loader: iterable yielding ``(inputs, labels)`` batches.

    Returns:
        None. The result is printed as ``Got <correct> / <total> With accuracy <pct>``.

    Note:
        The model's train/eval mode is not changed here; the caller decides
        whether to call ``model.eval()`` before evaluating.
    """
    # Evaluate on the device the model's parameters live on instead of relying on
    # a notebook-level `device` variable; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device('cpu')

    num_correct = 0
    num_samples = 0

    with torch.no_grad():  # gradients are not needed for evaluation
        for x, y in loader:
            x = x.to(eval_device)
            y = y.to(eval_device)
            x = x.reshape(x.shape[0], -1)  # flatten every sample to a vector

            scores = model(x)
            _, predictions = scores.max(1)  # index of the highest score per sample
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

    # Guard against an empty loader, which would otherwise raise ZeroDivisionError.
    accuracy = float(num_correct) / float(num_samples) * 100 if num_samples else 0.0
    print(
        f"Got {num_correct} / {num_samples} "
        f"With accuracy {accuracy:.2f}"
    )


In [None]:
# Compare accuracy with the model in its current mode against accuracy in eval mode.
check_accuracy(model, test_loader)
model.eval()
print('Eval: \n')
check_accuracy(model, test_loader)
# Switch back to training mode so the training loops that follow do not run in
# eval mode. The trailing semicolon keeps the cell from echoing the module repr.
model.train();

Got 59218 / 60000 With accuracy 98.70
Eval: 

Got 59218 / 60000 With accuracy 98.70


Ở đây kết quả giống nhau vì model NN không có dropout hay batch norm, nên train mode và eval mode cho ra output giống hệt nhau.

# 3. Forgot zero_grad()

Nếu không gọi zero_grad(), gradients của các batch trước sẽ bị cộng dồn (accumulate) vào gradients của batch hiện tại, nên mỗi bước update sẽ dùng gradient sai.

Quên zero grad cái biết ai to ngay

In [None]:
# Reset model and optimizer to the saved checkpoint before the run without zero_grad().
# map_location=device lets a checkpoint saved on one device type load on another
# (e.g. saved on GPU, loaded on a CPU-only runtime).
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted
# at /content/drive before this cell runs; confirm.
checkpoint = torch.load('/content/drive/MyDrive/Self study/Aladin Pearson/mycheckpoint.pth.tar', map_location=device)
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])

In [None]:
# Demonstration of mistake #3: the same training pass, but gradients are never reset.
for epoch in range(num_epochs):
    print(f'Epoch [{epoch+1} / {num_epochs}]')
    losses = []  # per-batch loss values collected during this epoch
    for data, targets in train_loader:
        # Move the batch to the training device and flatten every sample to a vector.
        data, targets = data.to(device), targets.to(device)
        data = data.reshape(data.size(0), -1)

        # Forward pass and loss.
        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())

        # NOTE: optimizer.zero_grad() is intentionally NOT called here, so the
        # gradients of earlier batches keep accumulating into each update.
        loss.backward()
        optimizer.step()
    print(f'Loss at epoch {epoch + 1} is {sum(losses)/len(losses):.2f}')


Epoch [1 / 3]
Loss at epoch 1 is 1.77
Epoch [2 / 3]
Loss at epoch 2 is 1.64
Epoch [3 / 3]
Loss at epoch 3 is 1.77


Để load lại state dict lên và cho thấy zero grad có vai trò như nào nhé

In [None]:
# Reset model and optimizer to the saved checkpoint before the run with zero_grad().
# map_location=device lets a checkpoint saved on one device type load on another
# (e.g. saved on GPU, loaded on a CPU-only runtime).
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted
# at /content/drive before this cell runs; confirm.
checkpoint = torch.load('/content/drive/MyDrive/Self study/Aladin Pearson/mycheckpoint.pth.tar', map_location=device)
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])

In [None]:
# Reference run: the same training pass with gradients reset before every step.
for epoch in range(num_epochs):
    print(f'Epoch [{epoch+1} / {num_epochs}]')
    losses = []  # per-batch loss values collected during this epoch
    for data, targets in train_loader:
        # Move the batch to the training device and flatten every sample to a vector.
        data, targets = data.to(device), targets.to(device)
        data = data.reshape(data.size(0), -1)

        # Reset gradients first so this step only uses the current batch's gradients.
        optimizer.zero_grad()

        # Forward pass and loss.
        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
    print(f'Loss at epoch {epoch + 1} is {sum(losses)/len(losses):.2f}')


Epoch [1 / 3]
Loss at epoch 1 is 0.02
Epoch [2 / 3]
Loss at epoch 2 is 0.02
Epoch [3 / 3]
Loss at epoch 3 is 0.02


# 4. Using SoftMax With Cross Entropy Loss

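-> nn.CrossEntropyLoss của PyTorch đã tự áp dụng log-softmax bên trong, nên chỉ cần truyền vào logits (output thô của layer cuối).
-> Nếu dùng softmax rồi mới đưa vào cross entropy loss thì softmax bị áp dụng 2 lần -> gradient bị nhỏ đi (vanishing gradient), model học chậm và kém hơn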

In [None]:
class NN2(nn.Module):
    """Variant of the classifier with dropout and a final softmax.

    The softmax output is kept on purpose: this class illustrates mistake #4
    (feeding softmax probabilities into a cross-entropy loss that expects logits).
    """

    def __init__(self, input_size, num_classes):
        """Create the layers.

        Args:
            input_size: number of features of each (flattened) input sample.
            num_classes: number of output probabilities produced per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
        self.dropout = nn.Dropout(0.5)
        # Fix: the class is spelled `nn.Softmax`; `nn.SoftMax` does not exist and
        # raised AttributeError as soon as the model was instantiated.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-class probabilities (softmax over dim 1) for the batch ``x``."""
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.softmax(self.fc2(x))
        return x

In [None]:
# Build the softmax variant on the training device. The optimizer is rebuilt as well:
# the old one still holds the previous model's parameters, so optimizer.step()
# would otherwise never update this model.
model = NN2(input_size=input_size, num_classes=num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)


AttributeError: module 'torch.nn' has no attribute 'SoftMax'

# 5. Using bias with batchnorm

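Khi một layer (Conv/Linear) đứng ngay trước batch norm thì nên set bias=False cho layer đó: batch norm trừ đi mean của batch rồi cộng tham số shift (beta) của riêng nó, nên bias của layer phía trước bị triệt tiêu và trở nên thừa. Để bias=True không làm model sai, nhưng tốn thêm tham số và tính toán không cần thiết.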

# 6. Using View as permute

In [None]:
import torch
x = torch.tensor([[1,2,3], [4, 5, 6]])
print(x)

# view(3, 2) keeps the elements in their existing row-major order and only regroups
# them into 3 rows of 2, so the result is NOT the transpose.
print(x.view(3, 2))
# permute(1, 0) swaps the two axes, which gives the actual transpose.
print(x.permute(1, 0)) # this is transposed

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 4],
        [2, 5],
        [3, 6]])


# 7. Incorrect Data Augmentation

In [None]:
import torchvision.transforms as transforms
# Example of an unsuitable augmentation pipeline: both flips are applied with
# probability 1.0 before the image is converted to a tensor.
my_transforms = transforms.Compose(
    [
        transforms.RandomVerticalFlip(p=1.0),
        transforms.RandomHorizontalFlip(p=1.0),
        transforms.ToTensor()
    ]
)

-> Với bộ MNIST, flip lộn ngược làm chữ số mất ý nghĩa hoặc đổi nhãn (ví dụ 6 thành 9) -> cần cân nhắc xem augmentation nào phù hợp với dữ liệu trước khi dùng

# 8. Not shuffling the data

# 9. Not normalizing the data

In [None]:
# Transform pipeline without normalization: the Normalize step is left commented out.
my_transforms = transforms.Compose([
    transforms.ToTensor(),
    # Commonly used MNIST statistics are mean 0.1307 and std 0.3081; uncomment to normalize:
    # transforms.Normalize(mean=(0.1307,), std=(0.3081,))
])

-> cứ normalize thì chạy tốt hơn

# 10. Not clipping gradients (GRUs, LSTMs, RNNs)

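Sử dụng gradient clipping để tránh exploding gradients (thường gặp ở RNNs, GRUs, LSTMs); gọi clip sau loss.backward() và trước optimizer.step(). Lưu ý: clipping không giải quyết được vanishing gradients.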

In [None]:
# Single-batch training loop with gradient clipping, reusing the current `data`/`targets` batch.
for epoch in range(num_epochs):
    print(f'Epoch [{epoch+1} / {num_epochs}]')
    # Move the batch to the training device and flatten every sample to a vector.
    data = data.to(device)
    targets = targets.to(device)

    data = data.reshape(data.shape[0], -1)

    # Forward pass and loss.
    scores = model(data)
    loss = criterion(scores, targets)
    print(loss)

    optimizer.zero_grad()
    loss.backward()
    # Clip the total gradient norm to at most 1. This must happen after backward()
    # has filled the gradients and before step() consumes them. The in-place
    # `clip_grad_norm_` replaces the deprecated `clip_grad_norm`.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()