In [1]:
import numpy as np
TRAIN_SIZE, VAL_SIZE = 50000, 10000
dataset_prefix = "fashionmnist" 
num_classes = 10

train_data = np.load("./hw5_data/{}_train.npy".format(dataset_prefix))
test_data = np.load("./hw5_data/{}_test.npy".format(dataset_prefix))

# Note: I am unable to use sklearn at the moment due to package inconsistencies
# Split train data to train/val/test
train_images = train_data[:TRAIN_SIZE, :]
val_images = train_data[TRAIN_SIZE:, :]

test_images = test_data.reshape(-1, 1, 28, 28)

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image

def imshow(image, title=None):
    fig, ax = plt.subplots(1, figsize=(2,2))
    ax.imshow(image.squeeze(0)*255, cmap='gray')
    if title is not None:
        plt.title(title)

for i in range(10):
    imshow(train_images[i], title='Image')

In [2]:
# PyTorch: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py

import torch
import torch.nn as nn
import torch.nn.functional as F

# CNN ARCHITECTURE: Conv2d(,,5) -> MaxPool2d(2,2) -> ReLU -> Conv2d(,,5) -> MaxPool2d(2,2) -> Linear(,) -> ReLu 
# -> Linear(, 10) -> Softmax

class Net(nn.Module):
    def __init__(self, S=1, P=0, conv1=(1, 10, 5), pool=2, pool_S=1, conv2=(10, 10, 5), fc1=20, fc2=10, drop=0.5, batch=1):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(conv1[0], conv1[1], conv1[2], stride=S, padding=P) # output dim=(BATCH_SIZE x c1 x (24+2P) x (24+2P))
        self.pool = nn.MaxPool2d(pool, stride=pool_S) # output dim=(BATCH_SIZE x c1 x (12+P) x (12+P))
        self.conv2 = nn.Conv2d(conv2[0], conv2[1], conv2[2], stride=1, padding=P) # output dim=(BATCH_SIZE x c2 x (8+3P) x (8+3P))
        self.l = conv2[1] * (4+int(1.5*P)) * (4+int(1.5*P))
        self.fc1 = nn.Linear(self.l, fc1, bias=True) 
        self.fc2 = nn.Linear(fc1, num_classes, bias=True)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(drop)
        self.bn1 = nn.BatchNorm1d(num_features=fc1)
        self.batch = batch
        print(self.batch)
        

    def forward(self, x):
        x = F.relu(self.pool(self.dropout(self.conv1(x)))) # dim=(BATCH_SIZE x c1 x (12+P) x (12+P))
        x = self.pool(self.conv2(x))
        x = x.view(self.batch, self.l)
        x = self.fc1(x) 
        if self.batch > 1:
            x = self.bn1(x)
        x = self.softmax(self.fc2(F.relu(x)))
        
        return x



In [3]:
import pandas as pd
import torch.optim as optim

def train(S, P, conv1, pool, pool_S, conv2, fc1, drop, opt, lr, mom, batch):
    # NN INITIALIZATION
    net = Net(S=S, P=P, conv1=conv1, pool=pool, pool_S=pool_S, conv2=conv2, fc1=fc1, drop=drop, batch=batch)
        
    criterion = nn.CrossEntropyLoss()
    
    if opt == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=mom)
    elif opt == 'adam':
        optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9,0.999))
    elif opt == 'nesterov':
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=mom, nesterov=True)
    
    
    trainloader = torch.utils.data.DataLoader(train_images, batch_size=batch,
                                              shuffle=True, num_workers=2)
    epsilon = 0.002
    losses = []
    vals = []
    for epoch in range(10):  # loop over the dataset multiple times 
        running_loss = 0.0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs = data[:, :-1]
            labels = data[:, -1]
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            if inputs.shape[0] == batch:
                outputs = net(inputs.reshape(batch, 1, 28, 28).float())
                outputs = outputs.reshape(batch, -1)
                loss = criterion(outputs, labels.long())
                loss.backward()
                optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % (50000//(10*batch)) == ((50000//(10*batch))-1):    # print every 1/10 epoch
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / (50000//(10*batch))))
                losses.append(running_loss/(50000//(10*batch)))
                running_loss = 0.0
                
                # val accuracy
                valloader = torch.utils.data.DataLoader(val_images, batch_size=batch,
                                                          shuffle=True, num_workers=2)
                correct = 0
                total = 0
                with torch.no_grad():
                    for data in valloader:
                        images, labels = data[:, :-1], data[:, -1]
                        if images.shape == (batch, 784):
                            outputs = net(images.reshape(batch, 1, 28, 28).float())
                            _, predicted = torch.max(outputs.data, 1)
                            total += labels.size(0)
                            correct += (predicted == labels).sum().item()

                print('Accuracy of the network on the 10000 test images: %d %%' % (
                    100 * correct / total))
                vals.append(100 * correct / total)
                

        if losses[-9] - losses[-1] < epsilon:
            break

    print('Finished Training')
    return net, losses, vals



In [4]:
# Perform random search to tune hyperparameters; more efficient than grid search since it allows for identifying important
# parameters more quickly, and avoids prioritizing less important ones (Stanford CS231)

def random_search(num_iter=1):
    # S = 1 as good practice and simplified dimensional constraints
    # pool = 2, pool_S=2 as convention according to cs231
    # fc2 = 10 (num_labels) 
    pad = np.random.randint(1, 2, size=num_iter)*2 # since filter kernel is 5x5,P in {0,2,4}
    c1 = 2**np.random.randint(9, 10, size=num_iter) # conv1 in [32, 512], kernel is 5x5
    c2 = 2**np.random.randint(9, 10, size=num_iter) # conv2 in [32, 512], kernel is 5x5
    fc1 = 2**np.random.randint(9, 10, size=num_iter) # fc1 in [32, 512]
    opt_l = ['sgd', 'nesterov', 'adam'] # opt in {sgd, nesterov, adam}
    opt = np.random.randint(2, 3, size=num_iter)
    lr = np.random.randint(4, 5, size=num_iter) # lr in {0.001, 0.0001}
    mom = np.random.uniform(0.7, 0.91, size=num_iter) # mom in [0.7, 0.99]
    drop = np.random.uniform(0.4, 0.81, size=num_iter) # dropout in [0.2, 0.8]
    batch = 2**(np.random.randint(6, 7, size=num_iter)) # batch in [16, 64]
    
    for i in range(num_iter):
        net, losses, vals = train(S=1, P=pad[i], conv1=(1, c1[i], 5), pool=2, pool_S=2, conv2=(c1[i], c2[i], 5), 
                            fc1=fc1[i], drop=drop[i], opt=opt_l[opt[i]], lr=0.1**lr[i], mom=mom[i], batch=int(batch[i]))
        print("Trained NN{:}".format(i))
        
        # Save NN
        PATH = 'fashionmnist/fashionmnist_net{:}.pth'.format(i)
        torch.save(net.state_dict(), PATH)
        
        # Save losses
        pd.DataFrame(losses).to_csv("fashionmnist/losses{:}.csv".format(i))
        pd.DataFrame(vals).to_csv("fashionmnist/vals{:}.csv".format(i))
        # Save parameters
        with open('fashionmnist/params{:}.txt'.format(i), 'w+') as f:
            f.write("{:}\n opt: {:}\n lr: {:}\n mom: {:}\n drop: {:}\n batch: {:}".format(net.parameters, opt_l[opt[i]], 
                                                                                          0.1**lr[i], mom[i], drop[i], batch[i]))
        
        print("Saved NN{:}".format(i))


In [934]:
random_search(1)

64
[1,    78] loss: 1.748
Accuracy of the network on the 10000 test images: 81 %
[1,   156] loss: 1.650
Accuracy of the network on the 10000 test images: 84 %
[1,   234] loss: 1.627
Accuracy of the network on the 10000 test images: 85 %
[1,   312] loss: 1.618
Accuracy of the network on the 10000 test images: 85 %
[1,   390] loss: 1.610
Accuracy of the network on the 10000 test images: 87 %
[1,   468] loss: 1.608
Accuracy of the network on the 10000 test images: 87 %
[1,   546] loss: 1.592
Accuracy of the network on the 10000 test images: 87 %
[1,   624] loss: 1.593
Accuracy of the network on the 10000 test images: 87 %
[1,   702] loss: 1.590
Accuracy of the network on the 10000 test images: 88 %
[1,   780] loss: 1.583
Accuracy of the network on the 10000 test images: 88 %
[2,    78] loss: 1.581
Accuracy of the network on the 10000 test images: 88 %
[2,   156] loss: 1.578
Accuracy of the network on the 10000 test images: 89 %
[2,   234] loss: 1.567
Accuracy of the network on the 10000 t

In [897]:
# TRAINING ON VALIDATION
BATCH=64
valloader = torch.utils.data.DataLoader(val_images, batch_size=BATCH,
                                              shuffle=True, num_workers=2)
epsilon = 0.005
losses = []
vals = []
for epoch in range(2):  # loop over the dataset multiple times 
    running_loss = 0.0
    for i, data in enumerate(valloader):
        # get the inputs; data is a list of [inputs, labels]
        inputs = data[:, :-1]
        labels = data[:, -1]
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        if inputs.shape[0] == batch:
            outputs = net(inputs.reshape(batch, 1, 28, 28).float())
            outputs = outputs.reshape(batch, -1)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % (10000//(10*batch)) == ((10000//(10*batch))-1):    # print every 1/10 epoch
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / (10000//(10*batch))))
            losses.append(running_loss/(10000//(10*batch)))
            running_loss = 0.0
                
    if losses[-9] - losses[-1] < epsilon:
        break

print('Finished Training')
print("Trained NN{:}".format(i))
        
# Save NN
PATH = 'fashionmnist/fashionmnist_net{:}.pth'.format(100)
torch.save(net.state_dict(), PATH)
        
# Save losses
pd.DataFrame(losses).to_csv("fashionmnist/losses{:}.csv".format(100))
pd.DataFrame(vals).to_csv("fashionmnist/vals{:}.csv".format(100))
        
print("Saved NN{:}".format(i))

16
Accuracy of the network on the 10000 test images: 71 %


In [18]:
# Load NN
PATH = 'fashionmnist/fashionmnist_net{:}.pth'.format(11)
BATCH = 64

net = Net(S=1, P=0, conv1=(1,32,5), pool=2, pool_S=2, conv2=(32,512,5), fc1=128, drop=0.3539679254488697, batch=100)
net.load_state_dict(torch.load(PATH))

# predictions on test set
testloader = torch.utils.data.DataLoader(test_images, batch_size=100, shuffle=False, num_workers=2)

predictions = []
with torch.no_grad():
    for i, data in enumerate(testloader):
        outputs = net(data.reshape(100, 1, 28, 28).float())
        predictions.extend(torch.max(outputs.data, 1)[1])
        print(i*100)
        

pd.DataFrame(predictions).to_csv("predictions2.csv")

100
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
