In [0]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import time

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


##Init Weights and Biases

In [0]:
%pip install wandb -q
import wandb
wandb.login()

True

## Define the transformations

In [0]:
# Define transforms for data preprocessing
transform = transforms.Compose([
    transforms.ToTensor()
])

trainset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)

testset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

## Create data loaders to shuffle and create batches

In [0]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1000, shuffle=True, num_workers=2)

testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

In [0]:
print('No. of train images: {}'.format(len(trainset)))
print('No. of test images: {}'.format(len(testset)))

print('No. of train batches: {}'.format(len(trainloader)))
print('No. of test batches: {}'.format(len(testloader)))

No. of train images: 60000
No. of test images: 10000
No. of train batches: 60
No. of test batches: 100


## Create the Network

In [0]:
class Network(nn.Module):
    
    def __init__(self):
        super(Network, self).__init__()
    
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.pool = nn.MaxPool2d(kernel_size = 2, stride = 2)

        self.fc1 = nn.Linear(12 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 60)
        self.fc3 = nn.Linear(60, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.pool(x)
        
        x = self.conv2(x)
        x = F.relu(x)
        x = self.pool(x)
        
        x = x.reshape(-1, 12 * 4 * 4)
        
        x = self.fc1(x)
        x = F.relu(x)
        
        x = self.fc2(x)
        x = F.relu(x)
        
        x = self.fc3(x)
        return x

## Function to find the accuracy

In [0]:
def find_acc(pred, label):
    correct = pred.argmax(dim = 1).eq(label)
    accuracy = correct.to(torch.float32).mean().item() * 100
    return accuracy

In [0]:
def train(network, epoch, criterion, optimizer, trainloader):
    loss_train = 0
    acc_train = 0
    network.train()
    
    for step in range(len(trainloader)):

        images , labels = next(iter(trainloader))
        
        # move the images and labels to GPU
        images = images.to(device)
        labels = labels.to(device)
        
        pred = network(images)
        
        # clear all the gradients before calculating them
        optimizer.zero_grad()
        
        # find the loss for the current step
        loss_train_step = criterion(pred , labels)
        
        # find accuracy
        acc_train_step = find_acc(pred, labels)
        
        # calculate the gradients
        loss_train_step.backward()
        
        # update the parameters
        optimizer.step()
        
        loss_train += loss_train_step.item()
        acc_train += acc_train_step  
            
        loss_train /= len(trainloader)
        acc_train /= len(testloader)

        return loss_train, acc_train  
        
def validate(network, epoch, criterion, testloader): 
    loss_valid = 0
    acc_valid = 0       
    network.eval()  

    for step in range(len(testloader)):

        images , labels = next(iter(testloader))
        
        # move the images and labels to GPU
        images = images.to(device)
        labels = labels.to(device)
        
        pred = network(images)
        
        # clear all the gradients before calculating them
        optimizer.zero_grad()
        
        # find the loss and acc for the current step
        loss_valid_step = criterion(pred , labels)
        acc_valid_step = find_acc(pred, labels)
      
        loss_valid += loss_valid_step.item()
        acc_valid += acc_valid_step

        loss_valid /= len(trainloader)
        acc_valid /= len(testloader)

        return loss_valid, acc_valid

## Train the Network

In [0]:
# WandB – Initialize a new run
wandb.init(name='try', notes='This is a trial',entity="arkalim", project="pytorch-fashion_mnist")
wandb.watch_called = False # Re-run the model without restarting the runtime, unnecessary after our next release

# WandB – Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config          # Initialize config
config.epochs = 250             
config.lr = 0.01    

# create an instance of the Network    
network = Network().to(device)

# loss defined using torch.nn
criterion = nn.CrossEntropyLoss()

# define the optimizer
optimizer = optim.Adam(network.parameters(), config.lr)

# note the starting time to find the total time elapsed
start_time = time.time()

for epoch in range(1,config.epochs+1):
    
    loss_train, acc_train = train(network, epoch, criterion, optimizer, trainloader)
    loss_valid, acc_valid = validate(network, epoch, criterion, testloader)
    
    print('Epoch: {}  Train Loss: {}  Train Acc: {}  Valid Loss: {}  Valid Acc: {}'.format(epoch, loss_train, acc_train, loss_valid, acc_valid))

    wandb.log({
        "Train Loss": loss_train,
        "Train Acc": acc_train,
        "Valid Loss": loss_valid,
        "Valid Acc": acc_valid})
    
# find the time at the end of training    
end_time = time.time()

total_time = end_time - start_time
print("Total time taken : {}".format(total_time))

Epoch: 1  Train Loss: 0.03843596378962199  Train Acc: 0.10300000756978989  Valid Loss: 0.03818322420120239  Valid Acc: 0.12999999523162842
Epoch: 2  Train Loss: 0.038245288530985515  Train Acc: 0.12400000542402267  Valid Loss: 0.03743819793065389  Valid Acc: 0.11999999731779099
Epoch: 3  Train Loss: 0.037571450074513756  Train Acc: 0.10000000149011612  Valid Loss: 0.035525242487589516  Valid Acc: 0.28999999165534973
Epoch: 4  Train Loss: 0.03607268333435058  Train Acc: 0.27900001406669617  Valid Loss: 0.03320521116256714  Valid Acc: 0.38999998569488525
Epoch: 5  Train Loss: 0.033050638437271115  Train Acc: 0.38700002431869507  Valid Loss: 0.03045504689216614  Valid Acc: 0.38999998569488525
Epoch: 6  Train Loss: 0.031086252133051554  Train Acc: 0.3920000195503235  Valid Loss: 0.034160979588826496  Valid Acc: 0.17999999225139618
Epoch: 7  Train Loss: 0.033322455485661824  Train Acc: 0.2460000067949295  Valid Loss: 0.027882428963979085  Valid Acc: 0.3799999952316284
Epoch: 8  Train Loss: 

## Testing the model

In [0]:
def test_model(model):

    start_time = time.time()

    num_correct = 0
    accuracy = 0

    # turning off backprop and gradient calculation.
    # this improves performance 
    with torch.no_grad():

        for batch in testloader:

            images, labels = batch
            images = images.to(device)
            labels = labels.to(device)

            total_images = len(testset)

            pred = model(images)

            num_correct_batch = pred.argmax(dim = 1).eq(labels).sum().item()

            accuracy_batch = pred.argmax(dim = 1).eq(labels).float().mean().item()

            num_correct += num_correct_batch
            accuracy += accuracy_batch

        accuracy /= len(testloader)

    print('Total number of test images: {}'.format(total_images))
    print('Total number of correct predictions: {}'.format(num_correct))
    print('Accuracy: {}'.format(accuracy * 100))

    end_time = time.time()
    print("Elapsed Time : {}".format(end_time - start_time))
    
# test the trained network    
test_model(network)    

Total number of test images: 10000
Total number of correct predictions: 8522
Accuracy: 85.21999830007553
Elapsed Time : 1.2061548233032227


## Save and Restore

### Save and Load Model's Parameters
#### Saving the model's state_dict()

In [0]:
path = 'network_weights.pth'

# Save the parameters
torch.save(network.state_dict(), path)

#### Loading the model's state_dict()


In [0]:
# create a new model object 
new_network = Network()

# move the network to GPU
new_network.to(device)

# load the network's parameters
new_network.load_state_dict(torch.load(path))

# set the network into evaluate mode
new_network.eval()

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (fc3): Linear(in_features=60, out_features=10, bias=True)
)

#### Test

In [0]:
test_model(new_network)

Total number of test images: 10000
Total number of correct predictions: 8522
Accuracy: 85.21999830007553
Elapsed Time : 1.2441446781158447


## Saving and Loading entire Model

In [0]:
path = 'full_network.pth'

# save the model
torch.save(network, path)

# load the model
new_model = torch.load(path)
new_model.eval()

  "type " + obj.__name__ + ". It won't be checked "


AttributeError: ignored

#### Test

In [0]:
test_model(new_model)