In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms

import multiprocessing

In [2]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Model specification

In [3]:
class multiple_layer_fc_network(nn.Module):
    """Fully connected classifier with a single ReLU hidden layer.

    Args:
        input_size: number of features in each (already flattened) input sample.
        num_classes: number of values produced per sample by the output layer.
        hidden_size: width of the hidden layer. Defaults to 100, the value that
            was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, input_size, num_classes, hidden_size=100):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 and return the result of the last linear layer.

        No activation is applied after `fc2`, so the returned values are
        unnormalised scores of shape (..., num_classes).
        """
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

### Dimensionality checking with random inputs

In [4]:
# Smoke test: push a random batch of 64 samples with 784 features through a
# throwaway model and print the shape of what comes out.
x = torch.randn(64, 784)
model = multiple_layer_fc_network(num_classes=10, input_size=784)
out = model(x)
print(out.size())

torch.Size([64, 10])


### Global Variables Specification

In [5]:
# --- reproducibility ---
# Seed torch's global RNG so weight initialisation and shuffling repeat across
# "Restart & Run All" executions.
seed = 42
torch.manual_seed(seed)

# --- model dimensions ---
input_size = 784
num_classes = 10

# --- optimisation ---
learning_rate = 0.001
batch_size = 64
num_epochs = 50
# number of consecutive epochs without a new best validation loss before stopping
early_stopping_criteria = 5

# --- data ---
# Never request more DataLoader workers than the machine has CPUs; 16 stays the
# upper bound that was used before.
num_workers = min(16, multiprocessing.cpu_count())

# fraction of the training set held out for validation
val_proportion = 0.2

# NOTE: absolute, machine-specific location; adjust when running elsewhere.
data_dir = "/mnt/addtional_data_ssd/pytorch_tutorials/data"

### Set up Dataset and DataLoader

In [6]:
# Load FashionMNIST (downloading into `data_dir` when it is not there yet).
# The complete training split gets its own name so that `train_dataset` only
# ever refers to the subset that is actually trained on.
full_train_dataset = datasets.FashionMNIST(root=data_dir, train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.FashionMNIST(root=data_dir, train=False, transform=transforms.ToTensor(), download=True)

# Hold out `val_proportion` of the training split for validation.
val_size = int(val_proportion * len(full_train_dataset))
train_size = len(full_train_dataset) - val_size
test_size = len(test_dataset)

print(f"number samples in train: {train_size}")
print(f"number samples in val: {val_size}")
print(f"number samples in test: {test_size}")

# A dedicated, seeded generator keeps the train/val membership identical on
# every run, so validation numbers from different runs are comparable.
split_generator = torch.Generator().manual_seed(0)
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset, [train_size, val_size], generator=split_generator
)

number samples in train: 48000
number samples in val: 12000
number samples in test: 10000


In [7]:
# Only the training loader reshuffles its samples; the validation and test
# loaders iterate in a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
)

In [8]:
# Peek at the first training sample to see the layout of a single image tensor.
image, target = train_dataset[0]
print("input image has shape:", image.shape, sep="\n")

input image has shape:
torch.Size([1, 28, 28])


### Model Initialization, Loss, Optimizer

In [9]:
# Build the classifier and move its parameters onto the selected device
# (`Module.to` returns the module itself, so the call can be chained).
clf = multiple_layer_fc_network(input_size=input_size, num_classes=num_classes).to(device)

# Adam over every classifier parameter, optimised against cross-entropy.
optimizer = optim.Adam(clf.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

### Train the model

In [10]:
# Train with early stopping on the validation loss.
#
# Differences from a naive loop that matter:
#   * the per-batch sample count lives in `num_in_batch`, so the global
#     `batch_size` used to build the DataLoaders is never overwritten;
#   * train/eval mode is switched explicitly around each phase;
#   * the weights of the best validation epoch are kept and restored at the
#     end, so later cells evaluate the best model rather than the last one.
best_val_loss = float('inf')
early_stopping_step = 0
best_state_dict = None  # copy of the weights that produced `best_val_loss`
log_interval = 200      # number of training steps averaged per progress line

for epoch in range(num_epochs):
    # ---------------- training ----------------
    clf.train()
    running_train_loss = 0.0
    for batch_idx, (images, targets) in enumerate(train_loader):
        # transfer data to gpu if available
        images, targets = images.to(device), targets.to(device)

        # flatten every image into a single feature vector for the linear layers
        images = images.reshape(images.shape[0], -1)

        # forward pass
        preds = clf(images)
        loss = criterion(preds, targets)

        # back prop and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

        # print the mean training loss of the last `log_interval` steps
        if (batch_idx + 1) % log_interval == 0:
            print(f"[epoch {epoch} step {batch_idx}] running training loss: {running_train_loss/log_interval:.3f}")
            running_train_loss = 0.0

    # ---------------- validation (once per epoch) ----------------
    clf.eval()
    val_loss = 0.0
    num_val_samples = 0
    correct = 0
    with torch.no_grad():
        for images, targets in val_loader:
            images, targets = images.to(device), targets.to(device)
            num_in_batch = images.shape[0]
            images = images.reshape(num_in_batch, -1)

            # one forward pass serves both the loss and the accuracy
            preds = clf(images)
            loss = criterion(preds, targets)

            # the criterion returns a batch mean; weight it by the batch size so
            # a smaller final batch does not skew the epoch average
            val_loss += loss.item() * num_in_batch
            num_val_samples += num_in_batch

            pred_labels = preds.argmax(dim=1)
            correct += (pred_labels == targets).sum().item()
    epoch_val_loss = val_loss / num_val_samples
    print(f"[epoch {epoch}] val loss: {epoch_val_loss:.3f}, accuracy: {100*correct/num_val_samples:.3f}%")

    # ---------------- early stopping ----------------
    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        early_stopping_step = 0
        # clone the tensors: state_dict() hands back live references that the
        # optimizer keeps updating in place
        best_state_dict = {
            name: tensor.detach().clone() for name, tensor in clf.state_dict().items()
        }
        print(f"current best validation loss: {best_val_loss:.3f}")
    else:
        early_stopping_step += 1
        print(f"validation loss stops decreasing for {early_stopping_step} epoch")
        if early_stopping_step >= early_stopping_criteria:
            print("early stopping criteria reached")
            break
    print("--------------------------------------------")

# Roll back to the best validation epoch (no-op if the last epoch was the best).
if best_state_dict is not None:
    clf.load_state_dict(best_state_dict)
    print(f"restored weights from the best epoch (val loss {best_val_loss:.3f})")
            

[epoch 0 step 199] running training loss: 0.806
[epoch 0 step 399] running training loss: 0.539
[epoch 0 step 599] running training loss: 0.496
[epoch 0] val loss: 0.444, accuracy: 84.483%
current best validation loss: 0.444
--------------------------------------------
[epoch 1 step 199] running training loss: 0.448
[epoch 1 step 399] running training loss: 0.439
[epoch 1 step 599] running training loss: 0.424
[epoch 1] val loss: 0.420, accuracy: 84.708%
current best validation loss: 0.420
--------------------------------------------
[epoch 2 step 199] running training loss: 0.390
[epoch 2 step 399] running training loss: 0.384
[epoch 2 step 599] running training loss: 0.378
[epoch 2] val loss: 0.395, accuracy: 86.017%
current best validation loss: 0.395
--------------------------------------------
[epoch 3 step 199] running training loss: 0.351
[epoch 3 step 399] running training loss: 0.362
[epoch 3 step 599] running training loss: 0.354
[epoch 3] val loss: 0.374, accuracy: 86.467%
c

### Test the network on the test data

In [11]:
# Evaluate the trained classifier on the test split.
# `num_in_batch` is used for the per-batch sample count so the global
# `batch_size` configured for the DataLoaders is left untouched.
clf.eval()
test_loss = 0.0
num_test_samples = 0
correct = 0
with torch.no_grad():
    for images, targets in test_loader:
        images, targets = images.to(device), targets.to(device)
        num_in_batch = images.shape[0]
        images = images.reshape(num_in_batch, -1)

        # a single forward pass feeds both the loss and the accuracy
        preds = clf(images)
        loss = criterion(preds, targets)

        # weight the batch-mean loss by the batch size to get a per-sample average
        test_loss += loss.item() * num_in_batch
        num_test_samples += num_in_batch

        pred_labels = preds.argmax(dim=1)
        correct += (pred_labels == targets).sum().item()
print(f"testing loss: {test_loss/num_test_samples:.3f}, testing accuracy: {100*correct/num_test_samples:.3f}%")


testing loss: 0.359, testing accuracy: 87.880%
