In [0]:
# source code inspired by
# https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html#model-training-and-validation-code

from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy
# Report the library versions this notebook was executed with.
print("PyTorch Version: ", torch.__version__)
print("Torchvision Version: ", torchvision.__version__)

# Directory the FashionMNIST files are downloaded to / read from.
root = './data'

# Preprocessing applied to every image; further transformations can be
# appended to this list.
transform = transforms.Compose([transforms.ToTensor()])

# Build the training split first, then the test split; both share the same
# root directory and preprocessing and are downloaded on demand.
train_set, test_set = (
    datasets.FashionMNIST(root=root, train=is_train_split, transform=transform, download=True)
    for is_train_split in (True, False)
)


# Hyperparameters
# TODO Find good hyperparameters
batch_size = 16
learning_rate = 1e-4
momentum = 0.95
# Adam's first-moment decay reuses the SGD momentum value.
beta1, beta2 = momentum, 0.99
num_epochs = 50
# Print a progress line every `ith_batch_display` batches (about every 10000 samples).
ith_batch_display = 10000 // batch_size

# Banner describing how this run differs from the given baseline architecture.
for banner_line in (
    "================================================================================================",
    "Architectural Change:",
    "\t - doubled Conv Filters",
    "\t - changed to Adam Optimizer ",
    "\t - added Residual Shortcuts",
    "\t - added BAtch Norm",
    "\t - changed 1st two Kernel Sizes to 5",
    "================================================================================================",
):
    print(banner_line)
print()

# Echo the hyperparameters so they are recorded next to the training log.
for hp_name, hp_value in (
    ("Batch Size:", batch_size),
    ("learning_rate:", learning_rate),
    ("beta1:", beta1),
    ("beta2:", beta2),
    ("num_epochs:", num_epochs),
    ("ith_batch_display:", ith_batch_display),
):
    print(hp_name, hp_value)
print()

# Wrap both splits in DataLoaders keyed by phase name. Only the training
# split is shuffled; the test split keeps its original order.
data_loaders = {
    phase_name: torch.utils.data.DataLoader(
        dataset=phase_dataset,
        batch_size=batch_size,
        shuffle=(phase_name == 'train'),
    )
    for phase_name, phase_dataset in (('train', train_set), ('test', test_set))
}

# implement your own NNs
    
class MyNeuralNetwork(nn.Module):
    """Residual CNN classifier for 1x28x28 images with 10 output classes.

    Layout, in forward order:
      * Stage 1: two 5x5 convolutions (1 -> 64 -> 64 channels) added to a 1x1
        shortcut convolution of the stage input, then BatchNorm2d, ReLU,
        2x2 max-pooling and dropout (p = 0.25).
      * Stage 2: two 3x3 convolutions (64 -> 128 -> 128 channels) following the
        same shortcut / batch-norm / ReLU / pool / dropout pattern.
      * Head: flatten -> Linear -> BatchNorm1d -> ReLU -> Linear(10).

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax internally, so the network must not apply softmax
    itself; doing so normalises twice and flattens the loss and its gradients.
    Use ``F.softmax(logits, dim=1)`` on the output if probabilities are needed.
    The arg-max class is the same for logits and probabilities.
    """

    def __init__(self):
        super(MyNeuralNetwork, self).__init__()
        self.n_img = 28*28
        # Stage 1 uses 5x5 kernels; padding 2 keeps the spatial size unchanged.
        self.kernel_size1 = 5
        self.padding1 = 2
        # Stage 2 uses 3x3 kernels; padding 1 keeps the spatial size unchanged.
        self.kernel_size = 3
        self.padding = 1
        # 1x1 kernels for the shortcut branches (only change the channel count).
        self.kernel_size_sc = 1
        self.n_ch0 = 1
        self.n_ch1 = 64
        self.n_ch2 = 64
        self.n_ch3 = 128
        self.n_ch4 = 128
        self.n_h1 = 512
        self.n_h2 = 10
        self.mp_win_size = 2
        self.mp_stride = self.mp_win_size
        self.p = 0.25

        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys stay unchanged.
        self.conv_sc02 = nn.Conv2d(self.n_ch0, self.n_ch2, self.kernel_size_sc)
        self.conv1 = nn.Conv2d(self.n_ch0, self.n_ch1, self.kernel_size1, padding=self.padding1)
        self.conv2 = nn.Conv2d(self.n_ch1, self.n_ch2, self.kernel_size1, padding=self.padding1)
        self.bn_conv2 = nn.BatchNorm2d(self.n_ch2)
        self.maxpool = nn.MaxPool2d(self.mp_win_size, self.mp_stride)
        self.dropout = nn.Dropout(self.p)
        self.conv_sc24 = nn.Conv2d(self.n_ch2, self.n_ch4, self.kernel_size_sc)
        self.conv3 = nn.Conv2d(self.n_ch2, self.n_ch3, self.kernel_size, padding=self.padding)
        self.conv4 = nn.Conv2d(self.n_ch3, self.n_ch4, self.kernel_size, padding=self.padding)
        self.bn_conv4 = nn.BatchNorm2d(self.n_ch4)

        # The convolutions preserve the 28x28 size, and each of the two pooling
        # steps divides the side length by the pooling stride: 28 -> 14 -> 7.
        # Flattened size is therefore 128 * 7 * 7 = 6272.
        pooled_side = 28 // (self.mp_stride ** 2)
        n_flat = self.n_ch4 * pooled_side * pooled_side
        self.fc1 = nn.Linear(n_flat, self.n_h1)
        self.bn_fc1 = nn.BatchNorm1d(self.n_h1)
        self.fc2 = nn.Linear(self.n_h1, self.n_h2)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: tensor of shape (batch, 1, 28, 28).

        Returns:
            Tensor of shape (batch, 10) holding unnormalised class scores.
        """
        # Stage 1: residual block, then down-sample and regularise.
        shortcut = self.conv_sc02(x)
        x = F.relu(self.conv1(x))
        x = self.conv2(x)
        x = F.relu(self.bn_conv2(x + shortcut))
        x = self.dropout(self.maxpool(x))

        # Stage 2: same pattern with more channels.
        shortcut = self.conv_sc24(x)
        x = F.relu(self.conv3(x))
        x = self.conv4(x)
        x = F.relu(self.bn_conv4(x + shortcut))
        x = self.dropout(self.maxpool(x))

        # Classifier head.
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.bn_fc1(self.fc1(x)))
        # Return logits: no softmax here, the loss function normalises them.
        return self.fc2(x)

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all
        dimensions of ``x`` except the leading batch dimension."""
        num_features = 1
        for dim_size in x.size()[1:]:
            num_features *= dim_size
        return num_features

    def name(self):
        """Return the display name of this model."""
        return "MyNeuralNetwork"



## training setup

# Pick the first GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Instantiate the network and move its parameters to the chosen device.
model = MyNeuralNetwork()
model = model.to(device)

# Classification loss used for both the train and the test phase.
criterion = nn.CrossEntropyLoss()

# Adam optimizer; the earlier SGD variant is kept for reference:
#   optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(beta1, beta2))

# Per-epoch metrics, one entry appended per completed phase.
train_acc_history, test_acc_history = [], []
train_loss_history, test_loss_history = [], []


# Train for `num_epochs` epochs. Every epoch runs a 'train' phase (with
# gradient updates) followed by a 'test' phase (evaluation only), records
# loss/accuracy for both, and remembers the weights with the best test accuracy.
best_acc = 0.0
# Snapshot the initial weights so `best_model_wts` is always defined, even if
# no epoch ever improves on `best_acc`.
best_model_wts = copy.deepcopy(model.state_dict())
since = time.time()
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    # Each epoch has a training and an evaluation phase.
    for phase in ['train', 'test']:
        is_train = phase == 'train'
        # train(True) enables dropout / batch-norm updates, train(False) is eval mode.
        model.train(is_train)

        running_loss = 0.0
        running_corrects = 0

        for batch_idx, (inputs, labels) in enumerate(data_loaders[phase]):
            # Move the batch to the same device as the model.
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # Only build the autograd graph while training.
            with torch.set_grad_enabled(is_train):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                # Predicted class = index of the largest score per sample.
                _, preds = torch.max(outputs, 1)

                if is_train:
                    loss.backward()
                    optimizer.step()

            # `loss` is the batch mean, so weight it by the batch size to be
            # able to average over the whole dataset afterwards.
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels)

            if batch_idx % ith_batch_display == 0:
                print('{} Batch: {} of {}'.format(phase, batch_idx, len(data_loaders[phase])))

        n_samples = len(data_loaders[phase].dataset)
        epoch_loss = running_loss / n_samples
        # Kept as a 0-dim tensor: the plotting code below converts the history
        # entries with `.cpu().numpy()`.
        epoch_acc = running_corrects.double() / n_samples

        print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

        if is_train:
            train_acc_history.append(epoch_acc)
            train_loss_history.append(epoch_loss)
        else:
            # Deep copy so later optimizer steps do not overwrite the snapshot.
            if epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            test_acc_history.append(epoch_acc)
            test_loss_history.append(epoch_loss)

    print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))

# Restore the best-performing weights; without this the model would keep the
# last epoch's weights and the snapshot above would never be used.
model.load_state_dict(best_model_wts)

# Convert the recorded per-epoch accuracies (tensors, possibly on the GPU)
# into NumPy values that matplotlib can plot.
acc_train_hist = [acc.cpu().numpy() for acc in train_acc_history]
acc_test_hist = [acc.cpu().numpy() for acc in test_acc_history]

# Epochs are shown 1-based on the x axis.
epoch_axis = range(1, num_epochs + 1)

plt.title("Validation/Test Accuracy vs. Number of Training Epochs")
plt.xlabel("Training Epochs")
plt.ylabel("Validation/Test Accuracy")
for acc_series, series_label in ((acc_train_hist, "Train"), (acc_test_hist, "Test")):
    plt.plot(epoch_axis, acc_series, label=series_label)
plt.ylim((0, 1.))
plt.xticks(np.arange(1, num_epochs + 1, 1.0))
plt.legend()
plt.show()


# Show the first few test images together with the model's predicted class.
examples = enumerate(data_loaders['test'])
batch_idx, (example_data, example_targets) = next(examples)

# Make inference mode explicit instead of relying on whatever mode the model
# was left in: dropout must be off and batch-norm must use running statistics.
model.eval()
with torch.no_grad():
    # The example batch lives on the CPU, so the model is moved there as well
    # (note: `model.cpu()` moves the model in place).
    output = model.cpu()(example_data)

# Index of the highest score per sample = predicted class id.
predicted_classes = output.argmax(dim=1)

# FashionMNIST class id -> human-readable label.
categories = {
    0:	'T-shirt/top',
    1:	'Trouser',
    2:	'Pullover',
    3:	'Dress',
    4:	'Coat',
    5:	'Sandal',
    6:	'Shirt',
    7:	'Sneaker',
    8:	'Bag',
    9:	'Ankle boot'
}

# Show at most 6 images (2 x 3 grid); fewer if the batch is smaller.
n_examples_shown = min(6, example_data.size(0))
for i in range(n_examples_shown):
    plt.subplot(2, 3, i + 1)
    # example_data[i][0] is the single grayscale channel of image i.
    plt.imshow(example_data[i][0], cmap='gray', interpolation='none')
    plt.title("Pred: {}".format(categories[predicted_classes[i].item()]))
    plt.xticks([])
    plt.yticks([])
# Adjust spacing once, after all subplots and titles exist.
plt.tight_layout()
plt.show()



0it [00:00, ?it/s]

PyTorch Version:  1.3.1
Torchvision Version:  0.4.2
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


26427392it [00:01, 13862731.53it/s]                             


Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw


0it [00:00, ?it/s]

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


32768it [00:00, 94782.15it/s]                            
0it [00:00, ?it/s]

Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


4423680it [00:01, 4010347.72it/s]                             
0it [00:00, ?it/s]

Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


8192it [00:00, 33093.51it/s]            


Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Processing...
Done!
Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )
Architectural Change:
	 - doubled Conv Filters
	 - changed to Adam Optimizer 
	 - added Residual Shortcuts
	 - added BAtch Norm
	 - changed 1st two Kernel Sizes to 5

Batch Size: 16
learning_rate: 0.0001
beta1: 0.95
beta2: 0.99
num_epochs: 50
ith_batch_display: 625

Epoch 0/49
----------




train Batch: 0 of 3750
train Batch: 625 of 3750


KeyboardInterrupt: ignored

In [0]:
# 15 Epochs with Given Arch: 80.3 % in 3:37 min
# 15 Epochs with Given Arch + Residual: 78.25 % in 10:40 min
# 15 Epochs with Given Arch + Residual + Adam Optimizer: 90.5 % in 05:18 min
# 15 Epochs with Given Arch + Residual + Adam Optimizer + BatchNorm: 92.97 % in 11:34 min !!! Not bad !!!
# 15 Epochs with Given Arch + Residual + Adam Optimizer + 2 Conv Layers: 90.5 % in 16:35 min
# 15 Epochs with 2XFilters + Residual + Adam Optimizer + 2 Conv Layers + BatchNorm: 93.03 % in 21:21 min
# 15 Epochs with 2XFilters + Residual + Adam Optimizer + 2 Conv Layers + BatchNorm + Dropout&tanh in FC Layer: 92.3 % in 21:28 min
# 15 Epochs with Given Arch + Residual + Adam Optimizer + 2 Conv Layers + BatchNorm + Dropout in FC Layer: 92.1 % in 15:32 min
# 15 Epochs with Given Arch + Residual + Adam Optimizer + BatchNorm + Dropout in FC Layer: 92.79 % in 12:21 min

# 50 Epochs with Given Arch + Residual + Adam Optimizer + BatchNorm: 94.30 % in 40:39 min
# 50 Epochs with 2XFilters + Residual + Adam Optimizer + BatchNorm: 94.12 % in 22.40 min
# 15 Epochs with 2XFilters + Residual + Adam Optimizer + BatchNorm + KernelSize=5: 93.21 % in 7.43 min
###### ---> 50 Epochs with 2XFilters + Residual + Adam Optimizer + BatchNorm + KernelSize=5: 94.43 % in 25.33 min
# 50 Epochs with Given Arch + Residual + Adam Optimizer + BatchNorm + KernelSize=5: 94.00 % in 22:49 min

# 50 Epochs with 4XFilters + Residual + Adam Optimizer + BatchNorm + KernelSize=5: 94.16 % in 42.51 min
# 50 Epochs with 4XFilters + Residual + Adam Optimizer + BatchNorm + KernelSize=5 + 2 Conv Layers: 94.32 % in 55.41 min

# 50 Epochs with 2XFilters + Residual + Adam Optimizer + BatchNorm + KernelSize=5 + 2xNeurons: 94.2 % in 64.19 min




