# Prep: 

In [0]:
from google.colab import drive
drive.mount('/ME')

Drive already mounted at /ME; to attempt to forcibly remount, call drive.mount("/ME", force_remount=True).


In [0]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
datadir='/ME/My Drive/LSDA_data/'
print(torch.__version__)
torch.cuda.is_available()

1.5.0+cu101


True

In [0]:
def get_mnist():
    data=np.float64(np.load(datadir+'mnist/MNIST.npy'))
    labels=np.float32(np.load(datadir+'mnist/MNIST_labels.npy'))
    print(data.shape)
    data=np.float32(data)/255.
    train_dat=data[0:50000].reshape((-1,1,28,28))
    train_labels=np.int32(labels[0:50000])
    val_dat=data[50000:60000].reshape((-1,1,28,28))
    val_labels=np.int32(labels[50000:60000])
    test_dat=data[60000:70000].reshape((-1,1,28,28))
    test_labels=np.int32(labels[60000:70000])
    return (train_dat, train_labels), (val_dat, val_labels), (test_dat, test_labels)

def get_mnist_transformed():
    data=np.float64(np.load(datadir+'mnist/MNIST_TR.npy')) 
    labels=np.float32(np.load(datadir+'mnist/MNIST_labels.npy'))
    print(data.shape)
    data=np.float32(data)/255.
    train_dat=data[0:50000].reshape((-1,1,28,28)) 
    train_labels=np.int32(labels[0:50000]) 
    val_dat=data[50000:60000].reshape((-1,1,28,28)) 
    val_labels=np.int32(labels[50000:60000]) 
    test_dat=data[60000:70000].reshape((-1,1,28,28)) 
    test_labels=np.int32(labels[60000:70000])
    return (train_dat, train_labels), (val_dat, val_labels), (test_dat, test_labels)


def get_letters():
    data=np.float64(np.load(datadir+'letters_data.npy'))
    data=np.float32(data)/255.
    data = data.reshape((-1,1,28,28))
    return (data)

def get_data(data_set):
    if (data_set=="mnist"):
        return(get_mnist())
    if (data_set=='letters'):
      return (get_letters())
    if (data_set=='trans'):
      return(get_mnist_transformed())

In [0]:
class N0_Net(nn.Module):
    def __init__(self,p=0.5,minimizer='Adam'):
        super(N0_Net, self).__init__()
        # 32 output features using 5x5 kernel applied to input image
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        # 64 output features using 5x5 kernel applied to 32 features of previous layer.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        # Dropout - zero out some output features so weights aren't updated.
        self.conv2_drop = nn.Dropout2d(p)
        # 64 x 4 x 4 = 1024 units total in final spartial layer fully connected to 256 unit later
        self.fc1 = nn.Linear(1024, 256)
        # Last layer has 10 units for 10 classes
        self.fc2 = nn.Linear(256, 10)
        if minimizer == 'Adam':
            self.optimizer = torch.optim.Adam(self.parameters(), lr = step_size)
        else:
            self.optimizer = torch.optim.SGD(self.parameters(), lr = step_size, momentum=0.9)
        self.first=True
        # negative log-likelihood loss 
        self.criterion=nn.CrossEntropyLoss()
            
    def forward(self, x):

        # Apply first conv then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        # Apply second conv then drop, then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Reshape 64 x 4 x 4 to 1024 units
        x = x.view(-1, 1024)
        # Apply fully connected layer with non-linearity relu
        x = F.relu(self.fc1(x))
        # Another dropout
        x = F.dropout(x, training=self.training)
        # Final 10 unit logits layer
        x = self.fc2(x)
        return x
    
    def get_acc_and_loss(self, data, targ):
        # Apply network to batch input
        output = self.forward(data)
        # Comput loss between logit output and targ (correct class labels)
        loss = self.criterion(output, targ)
        # Also compute correct classification rate
        pred = torch.max(output,1)[1]
        correct = torch.eq(pred,targ).sum()
        
        return loss,correct
        
    def run_grad(self,data,targ):

        # Compute loss and accuracy
        loss, correct=self.get_acc_and_loss(data,targ)
        # Zero out gradients
        self.optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters based on gradients
        self.optimizer.step()
        
        return loss, correct
    
        

# Problem 3

## (a): Make change to the original network

I remove the fc1 and fc2 fully connected layers and replace them with fc64 layer and name the network class N1_Net. To avoid repeating codes, I also changed the loss functions and gradient descent here. The detailed code for loss function is on part (c). 

In [0]:
class N1_Net(nn.Module):
    def __init__(self,p=0.5,minimizer='Adam'):
        super(N1_Net, self).__init__()
        # 32 output features using 5x5 kernel applied to input image
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        # 64 output features using 5x5 kernel applied to 32 features of previous layer.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        # Dropout - zero out some output features so weights aren't updated.
        self.conv2_drop = nn.Dropout2d(p)
        # 64 x 4 x 4 = 1024 units total in final spartial layer fully connected to 64 unit later
        self.fc64 = nn.Linear(1024, 64)
        if minimizer == 'Adam':
            self.optimizer = torch.optim.Adam(self.parameters(), lr = step_size)
        else:
            self.optimizer = torch.optim.SGD(self.parameters(), lr = step_size, momentum=0.9)
        self.first=True
        # negative log-likelihood loss 
        self.criterion=nn.CrossEntropyLoss()
            
    def forward(self, x):

        # Apply first conv then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        # Apply second conv then drop, then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Reshape 64 x 4 x 4 to 1024 units
        x = x.view(-1, 1024)
        # Apply fully connected layer with non-linearity relu, final 64 units 
        x = self.fc64(x)
        return x
    
    def get_acc_and_loss(self, data):
        # Apply network to batch input
        o = self.forward(data)
        affined = affine(data)
        o_tilde = self.forward(affined)
        # Comput loss between logit output and targ (correct class labels)
        loss = NT_Xent()(o,o_tilde)
        # Also compute correct classification rate        
        return loss
        
    def run_grad(self,data):

        # Compute loss and accuracy
        loss =self.get_acc_and_loss(data)
        # Zero out gradients
        self.optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters based on gradients
        self.optimizer.step()
        
        return loss

## (b): Write function to augment data at each batch

To limit the transformation to scaling and shifting, I set the parameters related to shearing to 0, and restricts the parameters relatted to scaling to positive numbers to prevent reflection. Any flipping or rotation is not appropriate here because some letters such as "p" and "q" are reflections of each other, and if we feed them into the algorithm for contrastive learning the model will learn to not distinguish between "p" and "q". 

In [0]:
def affine(x_in, factor=3):
  nn = x_in.shape[0]
  h = x_in.shape[2]
  w = x_in.shape[3]
  # for each batch element sample 6 random parameters 
  # for an affine transformation of the grid: 
  # factor controls the deviation from identity.
  u = ((torch.rand(nn,6)-0.5) * factor).to(device)
  u[:,0] = abs(u[:,0])
  u[:,4] = abs(u[:,4])
  u[:,1] = 0
  u[:,3] = 0
  # Add this random vector to this identity affine map:
  ID = torch.zeros(nn,6).to(device)
  ID[:,0] = 1
  ID[:,4] = 1
  theta = (u+ID).reshape(-1,2,3)
  # Creates the mapping of the deformed grid
  grid = F.affine_grid(theta, [nn,1,h,w] ,align_corners=True)
  # Applies this mapping to the image
  x_out = F.grid_sample(x_in, grid, padding_mode = 'border',align_corners=True)
  return x_out

## (c) Implement the loss function from the paper

In [0]:
class NT_Xent(nn.Module):
  def __init__(self):
    super(NT_Xent,self).__init__()
  
  def forward(self,o,o_tilde):
    # normalize feature vectors
    o = F.normalize(o,dim=1)
    o_tilde = F.normalize(o_tilde,dim=1)
    # Compute similarity matrix
    uv = torch.cat([o,o_tilde],dim=0)
    uv_t = torch.t(uv)
    vu = torch.cat([o,o_tilde],dim=0)
    sim_mat = torch.exp(torch.div(torch.mm(uv,uv_t),tau))
    # Compute softmax 
    sim_sums = torch.sum(sim_mat,dim=1)
    self_sim = torch.diag(sim_mat)
    denoms = sim_sums - self_sim
    nums= torch.exp(torch.div(torch.nn.CosineSimilarity()(uv,vu),tau))
    softmaxes = torch.div(nums,denoms)
    # Compute negative loss 
    neglog_losses = -torch.log(softmaxes)
    loss = torch.mean(neglog_losses)
    return loss

# (d) Implement stochastic gradient descent on this data and save the resulting network N1

I implement the stochastic gradient descent on this data by making changes to the class methods of the neural net, specifically get_acc_and_loss. The changes are reflected in question part (a), where I defined the class N1_Net. I also changed some settings for the function run_epoch. 


In [0]:
def run_epoch_simCLR(net,epoch,train,batch_size, num=None, ttype="train"):
    # Model is being trained dropout is applied
    net.train()
    if ttype=='train':
        t1=time.time()
        n=train.shape[0]
        if (num is not None):
            n=np.minimum(n,num)
        ii = np.array(np.arange(0,n,1))
        tr=train[ii]
        train_loss=0 
        with tqdm(total=n) as progress_bar:
            for j in np.arange(0,n,batch_size):
              # Transfer batch data to device (cpu or gpu)
                data=torch.from_numpy(tr[j:j+batch_size]).to(device)
              # Compute gradients, update params and report loss and correct
                loss  = net.run_grad(data) 
                train_loss += loss.item()                
                progress_bar.set_postfix(loss=loss.item())
                progress_bar.update(data.size(0))
        train_loss /= 2*n
        print('\nTraining set epoch {}: Avg. loss: {:.4f}'.format(epoch,train_loss))
        return (train_loss)

In [0]:
import time

# Some parameters>
batch_size=500
step_size=.001
num_epochs=20
numtrain=124800
minimizer="Adam"
data_set="letters"
model_name="N1"
dropout_p=0.5
dim=28
nchannels=1
use_gpu=True
tau = 0.1

# use GPU when possible
device = 'cuda:0' if torch.cuda.is_available() and use_gpu else 'cpu'
print(device)
# get data
train = get_data(data_set=data_set)

cuda:0


In [0]:
# Initialize the model
net = N1_Net(p = dropout_p, minimizer=minimizer)
net.to(device)
#define optimizer

# Run epochs
train_err = []
for i in range(num_epochs):
    train_err.append(run_epoch_simCLR(net,i,train,batch_size, num=numtrain, ttype="train"))
# Save model
torch.save(net.state_dict(), datadir+model_name)

100%|██████████| 124800/124800 [00:04<00:00, 28130.77it/s, loss=-2.76]



Training set epoch 0: Avg. loss: -0.0021


100%|██████████| 124800/124800 [00:04<00:00, 28870.65it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 15084.17it/s, loss=-2.27]


Training set epoch 1: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28970.26it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 15235.95it/s, loss=-2.27]


Training set epoch 2: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28899.39it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 15333.08it/s, loss=-2.26]


Training set epoch 3: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28834.89it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 14797.96it/s, loss=-2.28]


Training set epoch 4: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28995.70it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:07, 15705.36it/s, loss=-2.29]


Training set epoch 5: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28711.45it/s, loss=-2.81]
  0%|          | 500/124800 [00:00<00:08, 15091.77it/s, loss=-2.28]


Training set epoch 6: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28609.68it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 14802.03it/s, loss=-2.29]


Training set epoch 7: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28494.88it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 15209.87it/s, loss=-2.28]


Training set epoch 8: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28439.96it/s, loss=-2.81]
  0%|          | 500/124800 [00:00<00:08, 15228.31it/s, loss=-2.29]


Training set epoch 9: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28415.38it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:07, 15840.95it/s, loss=-2.29]


Training set epoch 10: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28298.09it/s, loss=-2.81]
  0%|          | 500/124800 [00:00<00:07, 15556.35it/s, loss=-2.29]


Training set epoch 11: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28234.95it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:07, 15616.47it/s, loss=-2.3]


Training set epoch 12: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28325.97it/s, loss=-2.82]
  0%|          | 500/124800 [00:00<00:08, 15355.20it/s, loss=-2.3]


Training set epoch 13: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28357.41it/s, loss=-2.81]
  0%|          | 500/124800 [00:00<00:07, 15689.14it/s, loss=-2.29]


Training set epoch 14: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28534.57it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 15056.77it/s, loss=-2.3]


Training set epoch 15: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28693.47it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 15127.04it/s, loss=-2.3]


Training set epoch 16: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28522.71it/s, loss=-2.82]
  0%|          | 500/124800 [00:00<00:08, 15089.49it/s, loss=-2.3]


Training set epoch 17: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28685.11it/s, loss=-2.81]
  0%|          | 500/124800 [00:00<00:08, 15172.35it/s, loss=-2.3]


Training set epoch 18: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28751.68it/s, loss=-2.81]



Training set epoch 19: Avg. loss: -0.0023


## (e) Define new network N2

Because now we need to train the network on handwritten digits with supervised algorithm using softmax loss, I will make alterations to the run_one_epoch function as well.

In [0]:
class N2_Net(nn.Module):
    def __init__(self,p=0.5,minimizer='Adam'):
        super(N2_Net, self).__init__()
        # 32 output features using 5x5 kernel applied to input image
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        # 64 output features using 5x5 kernel applied to 32 features of previous layer.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        # Dropout - zero out some output features so weights aren't updated.
        self.conv2_drop = nn.Dropout2d(p)
        # 64 x 4 x 4 = 1024 units total in final spartial layer fully connected to 10 unit later
        self.fc10 = nn.Linear(1024, 10)
        if minimizer == 'Adam':
            self.optimizer = torch.optim.Adam(self.parameters(), lr = step_size)
        else:
            self.optimizer = torch.optim.SGD(self.parameters(), lr = step_size, momentum=0.9)
        self.first=True
        # negative log-likelihood loss 
        self.criterion=nn.CrossEntropyLoss()
            
    def forward(self, x):

        # Apply first conv then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        # Apply second conv then drop, then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Reshape 64 x 4 x 4 to 1024 units
        x = x.view(-1, 1024)
        # Apply fully connected layer with non-linearity relu
        x = self.fc10(x)
        return x
    
    def get_acc_and_loss(self, data, targ):
        # Apply network to batch input
        output = self.forward(data)
        # Comput loss between logit output and targ (correct class labels)
        loss = self.criterion(output, targ)
        # Also compute correct classification rate
        pred = torch.max(output,1)[1]
        correct = torch.eq(pred,targ).sum()
        
        return loss,correct
        
    def run_grad(self,data,targ):

        # Compute loss and accuracy
        loss, correct=self.get_acc_and_loss(data,targ)
        # Zero out gradients
        self.optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters based on gradients
        self.optimizer.step()
        
        return loss, correct
    

In [0]:
model_name = "N1"
N1 = N1_Net(p = dropout_p, minimizer=minimizer)
N1.to(device)
state_dict = torch.load(datadir+model_name, map_location = device)
N1.load_state_dict(state_dict)

<All keys matched successfully>

In [0]:
N2 = N2_Net(p=dropout_p,minimizer = minimizer)
N2.to(device)
# Get parameter sets of both networks
params = N1.named_parameters()
params2 = N2.named_parameters()
# Make a dictionary of the new one
dict_params2 = dict(params2)
# Loop over parameters of N1
for name, param in params:
  if name in dict_params2:
    dict_params2[name].data.copy_(param.data)
    
N2.load_state_dict(dict_params2)

<All keys matched successfully>

In [0]:
PP = []
for name, param in N2.named_parameters():
  print(name,param.shape)
  if 'fc10' in name:
    PP.append(param)
N2.optimizer = torch.optim.Adam(PP,lr=step_size)

conv1.weight torch.Size([32, 1, 5, 5])
conv1.bias torch.Size([32])
conv2.weight torch.Size([64, 32, 5, 5])
conv2.bias torch.Size([64])
fc10.weight torch.Size([10, 1024])
fc10.bias torch.Size([10])


In [0]:
def run_epoch(net,epoch,train,batch_size, num=None, ttype="train"):
    
    # Model is being trained dropout is applied
    net.train()
    if ttype=='train':
        t1=time.time()
        n=train[0].shape[0]
        if (num is not None):
            n=np.minimum(n,num)
        ii=np.array(np.arange(0,n,1))
        tr=train[0][ii]
        y=train[1][ii]
        train_loss=0; train_correct=0
        with tqdm(total=len(y)) as progress_bar:
            for j in np.arange(0,len(y),batch_size):
              # Transfer batch data to device (cpu or gpu)
                data=torch.from_numpy(tr[j:j+batch_size]).to(device)
                targ=torch.from_numpy(y[j:j+batch_size]).type(torch.long).to(device)
              # Compute gradients, update params and report loss and correct
                loss, correct = net.run_grad(data,targ) 
                
                train_loss += loss.item()
                train_correct += correct.item()
                
                progress_bar.set_postfix(loss=loss.item())
                progress_bar.update(data.size(0))
        train_loss /= len(y)
        print('\nTraining set epoch {}: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(epoch,
            train_loss, train_correct, len(y),
            100. * train_correct / len(y)))

In [0]:
def net_test(net,val,batch_size,ttype='val'):

    # Do not apply dropout or gradients.
    net.eval()
    with torch.no_grad():
                test_loss = 0
                test_correct = 0
                vald=val[0]
                yval=val[1]
                for j in np.arange(0,len(yval),batch_size):
                    data=torch.torch.from_numpy(vald[j:j+batch_size]).to(device)
                    targ = torch.torch.from_numpy(yval[j:j+batch_size]).type(torch.long).to(device)
                    loss,correct=net.get_acc_and_loss(data,targ)

                    test_loss += loss.item()
                    test_correct += correct.item()

                test_loss /= len(yval)
                SSS='Validation'
                if (ttype=='test'):
                    SSS='Test'
                print('\n{} set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(SSS,
                    test_loss, test_correct, len(yval),
                    100. * test_correct / len(yval)))

In [0]:
numtrain=50000
data_set="mnist"
model_name="N2"
# get data
train,val,test=get_data(data_set=data_set)

# Run epochs
for i in range(num_epochs):
    run_epoch(N2,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N2,val,batch_size)
# Test on test set.
net_test(N2,test,batch_size,ttype='test')
# Save model
torch.save(N2.state_dict(), datadir+model_name)

  6%|▌         | 3000/50000 [00:00<00:01, 36227.85it/s, loss=2.14]

(70000, 784)


100%|██████████| 50000/50000 [00:00<00:00, 54179.94it/s, loss=1.03]
 12%|█▏        | 6000/50000 [00:00<00:00, 56379.35it/s, loss=0.807]


Training set epoch 0: Avg. loss: 0.0028, Accuracy: 37809/50000 (76%)


Validation set: Avg. loss: 0.0016, Accuracy: 8988/10000 (90%)



100%|██████████| 50000/50000 [00:00<00:00, 57921.94it/s, loss=0.738]
 12%|█▏        | 6000/50000 [00:00<00:00, 56934.97it/s, loss=0.535]


Training set epoch 1: Avg. loss: 0.0014, Accuracy: 43467/50000 (87%)


Validation set: Avg. loss: 0.0010, Accuracy: 9204/10000 (92%)



100%|██████████| 50000/50000 [00:00<00:00, 57474.72it/s, loss=0.601]
 12%|█▏        | 6000/50000 [00:00<00:00, 57257.26it/s, loss=0.444]


Training set epoch 2: Avg. loss: 0.0011, Accuracy: 44360/50000 (89%)


Validation set: Avg. loss: 0.0008, Accuracy: 9289/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 56396.55it/s, loss=0.538]
 12%|█▏        | 6000/50000 [00:00<00:00, 57273.28it/s, loss=0.376]


Training set epoch 3: Avg. loss: 0.0009, Accuracy: 44852/50000 (90%)


Validation set: Avg. loss: 0.0006, Accuracy: 9362/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 58622.32it/s, loss=0.533]
 12%|█▏        | 6000/50000 [00:00<00:00, 59303.15it/s, loss=0.332]


Training set epoch 4: Avg. loss: 0.0008, Accuracy: 45273/50000 (91%)


Validation set: Avg. loss: 0.0006, Accuracy: 9416/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 57019.79it/s, loss=0.459]
 12%|█▏        | 6000/50000 [00:00<00:00, 57697.28it/s, loss=0.311]


Training set epoch 5: Avg. loss: 0.0007, Accuracy: 45396/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9456/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 56889.97it/s, loss=0.459]
 13%|█▎        | 6500/50000 [00:00<00:00, 60435.18it/s, loss=0.298]


Training set epoch 6: Avg. loss: 0.0007, Accuracy: 45736/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9484/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 57849.12it/s, loss=0.428]
 12%|█▏        | 6000/50000 [00:00<00:00, 58950.58it/s, loss=0.264]


Training set epoch 7: Avg. loss: 0.0006, Accuracy: 45894/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9505/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 60550.95it/s, loss=0.403]
 12%|█▏        | 6000/50000 [00:00<00:00, 56962.03it/s, loss=0.258]


Training set epoch 8: Avg. loss: 0.0006, Accuracy: 45873/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9521/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 56317.18it/s, loss=0.377]
 12%|█▏        | 6000/50000 [00:00<00:00, 55604.02it/s, loss=0.28]


Training set epoch 9: Avg. loss: 0.0006, Accuracy: 46035/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9541/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 57367.34it/s, loss=0.368]
 12%|█▏        | 6000/50000 [00:00<00:00, 56690.76it/s, loss=0.239]


Training set epoch 10: Avg. loss: 0.0006, Accuracy: 46095/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9558/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 59173.82it/s, loss=0.385]
 12%|█▏        | 6000/50000 [00:00<00:00, 56305.93it/s, loss=0.234]


Training set epoch 11: Avg. loss: 0.0005, Accuracy: 46236/50000 (92%)


Validation set: Avg. loss: 0.0003, Accuracy: 9566/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 57972.38it/s, loss=0.345]
 12%|█▏        | 6000/50000 [00:00<00:00, 56298.25it/s, loss=0.227]


Training set epoch 12: Avg. loss: 0.0005, Accuracy: 46333/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9582/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58728.72it/s, loss=0.346]
 14%|█▍        | 7000/50000 [00:00<00:00, 62989.32it/s, loss=0.281]


Training set epoch 13: Avg. loss: 0.0005, Accuracy: 46361/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9584/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 59245.05it/s, loss=0.353]
 13%|█▎        | 6500/50000 [00:00<00:00, 62676.82it/s, loss=0.215]


Training set epoch 14: Avg. loss: 0.0005, Accuracy: 46468/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9600/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 57378.56it/s, loss=0.336]
 12%|█▏        | 6000/50000 [00:00<00:00, 57290.76it/s, loss=0.213]


Training set epoch 15: Avg. loss: 0.0005, Accuracy: 46468/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9607/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 57189.64it/s, loss=0.328]
 12%|█▏        | 6000/50000 [00:00<00:00, 54938.49it/s, loss=0.203]


Training set epoch 16: Avg. loss: 0.0005, Accuracy: 46605/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9613/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 57387.29it/s, loss=0.337]
 13%|█▎        | 6500/50000 [00:00<00:00, 61579.93it/s, loss=0.239]


Training set epoch 17: Avg. loss: 0.0005, Accuracy: 46451/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9622/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58590.09it/s, loss=0.304]
 13%|█▎        | 6500/50000 [00:00<00:00, 60286.44it/s, loss=0.203]


Training set epoch 18: Avg. loss: 0.0005, Accuracy: 46519/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9630/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58650.09it/s, loss=0.318]



Training set epoch 19: Avg. loss: 0.0005, Accuracy: 46606/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9628/10000 (96%)


Test set: Avg. loss: 0.0003, Accuracy: 9633/10000 (96%)



I get about 96% testing accuracy, which is pretty good considering that N2 only trains parameters in fc10 layer without fine-tuning the convolutional layer parameters on handwritten digits. 

# Problem 4: Verify that N2 generalizes better

First, I run N2 on a small training set of 1000 digits total. 

In [0]:
# Some parameters>
numtrain=1000
model_name="N2_small"
# Run epochs
for i in range(num_epochs):
    run_epoch(N2,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N2,val,batch_size)
# Test on test set.
net_test(N2,test,batch_size,ttype='test')
# Save model
torch.save(N2.state_dict(), datadir+model_name)

100%|██████████| 1000/1000 [00:00<00:00, 40096.59it/s, loss=0.253]
100%|██████████| 1000/1000 [00:00<00:00, 37143.37it/s, loss=0.226]
100%|██████████| 1000/1000 [00:00<00:00, 37680.27it/s, loss=0.261]



Training set epoch 0: Avg. loss: 0.0005, Accuracy: 937/1000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9628/10000 (96%)


Training set epoch 1: Avg. loss: 0.0004, Accuracy: 941/1000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9628/10000 (96%)


Training set epoch 2: Avg. loss: 0.0005, Accuracy: 933/1000 (93%)



100%|██████████| 1000/1000 [00:00<00:00, 46828.68it/s, loss=0.249]
100%|██████████| 1000/1000 [00:00<00:00, 59927.19it/s, loss=0.247]
100%|██████████| 1000/1000 [00:00<00:00, 54239.03it/s, loss=0.237]


Validation set: Avg. loss: 0.0003, Accuracy: 9628/10000 (96%)


Training set epoch 3: Avg. loss: 0.0004, Accuracy: 938/1000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9630/10000 (96%)


Training set epoch 4: Avg. loss: 0.0005, Accuracy: 942/1000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9627/10000 (96%)


Training set epoch 5: Avg. loss: 0.0005, Accuracy: 939/1000 (94%)




100%|██████████| 1000/1000 [00:00<00:00, 51312.12it/s, loss=0.254]
100%|██████████| 1000/1000 [00:00<00:00, 57172.71it/s, loss=0.255]
100%|██████████| 1000/1000 [00:00<00:00, 55172.24it/s, loss=0.241]


Validation set: Avg. loss: 0.0003, Accuracy: 9628/10000 (96%)


Training set epoch 6: Avg. loss: 0.0004, Accuracy: 946/1000 (95%)


Validation set: Avg. loss: 0.0003, Accuracy: 9627/10000 (96%)


Training set epoch 7: Avg. loss: 0.0005, Accuracy: 929/1000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9627/10000 (96%)


Training set epoch 8: Avg. loss: 0.0004, Accuracy: 940/1000 (94%)




100%|██████████| 1000/1000 [00:00<00:00, 53841.47it/s, loss=0.246]
100%|██████████| 1000/1000 [00:00<00:00, 57032.77it/s, loss=0.243]
100%|██████████| 1000/1000 [00:00<00:00, 53755.21it/s, loss=0.248]


Validation set: Avg. loss: 0.0003, Accuracy: 9625/10000 (96%)


Training set epoch 9: Avg. loss: 0.0004, Accuracy: 932/1000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9626/10000 (96%)


Training set epoch 10: Avg. loss: 0.0004, Accuracy: 947/1000 (95%)


Validation set: Avg. loss: 0.0003, Accuracy: 9627/10000 (96%)


Training set epoch 11: Avg. loss: 0.0004, Accuracy: 941/1000 (94%)




100%|██████████| 1000/1000 [00:00<00:00, 57209.36it/s, loss=0.237]
100%|██████████| 1000/1000 [00:00<00:00, 57538.19it/s, loss=0.223]
100%|██████████| 1000/1000 [00:00<00:00, 58842.65it/s, loss=0.236]


Validation set: Avg. loss: 0.0003, Accuracy: 9629/10000 (96%)


Training set epoch 12: Avg. loss: 0.0004, Accuracy: 949/1000 (95%)


Validation set: Avg. loss: 0.0003, Accuracy: 9625/10000 (96%)


Training set epoch 13: Avg. loss: 0.0004, Accuracy: 947/1000 (95%)


Validation set: Avg. loss: 0.0003, Accuracy: 9621/10000 (96%)


Training set epoch 14: Avg. loss: 0.0004, Accuracy: 938/1000 (94%)




100%|██████████| 1000/1000 [00:00<00:00, 56498.07it/s, loss=0.218]
100%|██████████| 1000/1000 [00:00<00:00, 46302.41it/s, loss=0.255]
100%|██████████| 1000/1000 [00:00<00:00, 49640.26it/s, loss=0.256]


Validation set: Avg. loss: 0.0003, Accuracy: 9618/10000 (96%)


Training set epoch 15: Avg. loss: 0.0004, Accuracy: 954/1000 (95%)


Validation set: Avg. loss: 0.0003, Accuracy: 9616/10000 (96%)


Training set epoch 16: Avg. loss: 0.0004, Accuracy: 940/1000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9614/10000 (96%)


Training set epoch 17: Avg. loss: 0.0004, Accuracy: 936/1000 (94%)




100%|██████████| 1000/1000 [00:00<00:00, 55784.22it/s, loss=0.211]
100%|██████████| 1000/1000 [00:00<00:00, 48971.99it/s, loss=0.215]



Validation set: Avg. loss: 0.0003, Accuracy: 9616/10000 (96%)


Training set epoch 18: Avg. loss: 0.0004, Accuracy: 951/1000 (95%)


Validation set: Avg. loss: 0.0003, Accuracy: 9615/10000 (96%)


Training set epoch 19: Avg. loss: 0.0004, Accuracy: 943/1000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9615/10000 (96%)


Test set: Avg. loss: 0.0003, Accuracy: 9635/10000 (96%)



The test set accuracy does not change and stays at 96% which is very high. 

Next, I run N0 on the same 1000 digits

In [0]:
model_name="N0_small"
# Initialize the model
N0 = N0_Net(p = dropout_p, minimizer=minimizer)
N0.to(device)
# Run epochs
for i in range(num_epochs):
    run_epoch(N0,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N0,val,batch_size)
# Test on test set.
net_test(N0,test,batch_size,ttype='test')
# Save model
torch.save(N0.state_dict(), datadir+'models/'+model_name)

100%|██████████| 1000/1000 [00:00<00:00, 35427.86it/s, loss=2.28]
100%|██████████| 1000/1000 [00:00<00:00, 38242.33it/s, loss=2.19]
100%|██████████| 1000/1000 [00:00<00:00, 37945.14it/s, loss=2.05]



Training set epoch 0: Avg. loss: 0.0046, Accuracy: 132/1000 (13%)


Validation set: Avg. loss: 0.0045, Accuracy: 5233/10000 (52%)


Training set epoch 1: Avg. loss: 0.0044, Accuracy: 310/1000 (31%)


Validation set: Avg. loss: 0.0042, Accuracy: 4757/10000 (48%)


Training set epoch 2: Avg. loss: 0.0042, Accuracy: 377/1000 (38%)



100%|██████████| 1000/1000 [00:00<00:00, 46819.79it/s, loss=1.84]
100%|██████████| 1000/1000 [00:00<00:00, 50330.64it/s, loss=1.55]
100%|██████████| 1000/1000 [00:00<00:00, 49738.57it/s, loss=1.27]


Validation set: Avg. loss: 0.0038, Accuracy: 6031/10000 (60%)


Training set epoch 3: Avg. loss: 0.0037, Accuracy: 522/1000 (52%)


Validation set: Avg. loss: 0.0032, Accuracy: 7287/10000 (73%)


Training set epoch 4: Avg. loss: 0.0032, Accuracy: 597/1000 (60%)


Validation set: Avg. loss: 0.0026, Accuracy: 7740/10000 (77%)


Training set epoch 5: Avg. loss: 0.0026, Accuracy: 662/1000 (66%)




100%|██████████| 1000/1000 [00:00<00:00, 53387.78it/s, loss=1.06]
100%|██████████| 1000/1000 [00:00<00:00, 50552.05it/s, loss=0.856]
100%|██████████| 1000/1000 [00:00<00:00, 53951.58it/s, loss=0.791]


Validation set: Avg. loss: 0.0020, Accuracy: 7801/10000 (78%)


Training set epoch 6: Avg. loss: 0.0021, Accuracy: 715/1000 (72%)


Validation set: Avg. loss: 0.0015, Accuracy: 8056/10000 (81%)


Training set epoch 7: Avg. loss: 0.0017, Accuracy: 741/1000 (74%)


Validation set: Avg. loss: 0.0012, Accuracy: 8299/10000 (83%)


Training set epoch 8: Avg. loss: 0.0015, Accuracy: 761/1000 (76%)




100%|██████████| 1000/1000 [00:00<00:00, 54724.49it/s, loss=0.687]
100%|██████████| 1000/1000 [00:00<00:00, 51938.63it/s, loss=0.603]
100%|██████████| 1000/1000 [00:00<00:00, 51161.90it/s, loss=0.509]


Validation set: Avg. loss: 0.0011, Accuracy: 8365/10000 (84%)


Training set epoch 9: Avg. loss: 0.0013, Accuracy: 788/1000 (79%)


Validation set: Avg. loss: 0.0010, Accuracy: 8424/10000 (84%)


Training set epoch 10: Avg. loss: 0.0012, Accuracy: 812/1000 (81%)


Validation set: Avg. loss: 0.0009, Accuracy: 8576/10000 (86%)


Training set epoch 11: Avg. loss: 0.0010, Accuracy: 841/1000 (84%)




100%|██████████| 1000/1000 [00:00<00:00, 51503.03it/s, loss=0.451]
100%|██████████| 1000/1000 [00:00<00:00, 50444.44it/s, loss=0.459]
100%|██████████| 1000/1000 [00:00<00:00, 53276.56it/s, loss=0.451]


Validation set: Avg. loss: 0.0009, Accuracy: 8691/10000 (87%)


Training set epoch 12: Avg. loss: 0.0009, Accuracy: 856/1000 (86%)


Validation set: Avg. loss: 0.0008, Accuracy: 8808/10000 (88%)


Training set epoch 13: Avg. loss: 0.0009, Accuracy: 874/1000 (87%)


Validation set: Avg. loss: 0.0007, Accuracy: 8898/10000 (89%)


Training set epoch 14: Avg. loss: 0.0008, Accuracy: 870/1000 (87%)




100%|██████████| 1000/1000 [00:00<00:00, 52356.16it/s, loss=0.392]
100%|██████████| 1000/1000 [00:00<00:00, 51213.13it/s, loss=0.373]
100%|██████████| 1000/1000 [00:00<00:00, 50751.46it/s, loss=0.343]


Validation set: Avg. loss: 0.0007, Accuracy: 8962/10000 (90%)


Training set epoch 15: Avg. loss: 0.0008, Accuracy: 883/1000 (88%)


Validation set: Avg. loss: 0.0007, Accuracy: 8984/10000 (90%)


Training set epoch 16: Avg. loss: 0.0007, Accuracy: 895/1000 (90%)


Validation set: Avg. loss: 0.0006, Accuracy: 9079/10000 (91%)


Training set epoch 17: Avg. loss: 0.0007, Accuracy: 899/1000 (90%)




100%|██████████| 1000/1000 [00:00<00:00, 50246.23it/s, loss=0.324]
100%|██████████| 1000/1000 [00:00<00:00, 58231.58it/s, loss=0.311]



Validation set: Avg. loss: 0.0006, Accuracy: 9137/10000 (91%)


Training set epoch 18: Avg. loss: 0.0006, Accuracy: 915/1000 (92%)


Validation set: Avg. loss: 0.0006, Accuracy: 9157/10000 (92%)


Training set epoch 19: Avg. loss: 0.0006, Accuracy: 919/1000 (92%)


Validation set: Avg. loss: 0.0005, Accuracy: 9164/10000 (92%)


Test set: Avg. loss: 0.0005, Accuracy: 9215/10000 (92%)



The test accuracy of N0 is not horrible, but still lower at 92% in comparison to N2's 96%. N2 generalizes better because it has parameters transferred from N1, which is trained on handwritte letters instead of digits, but still trained to recognize certain features of images that distinguish different objects. 

# Problem 5: Compare results to N2Rand

The results from training N2 on 1000 data is already shown in problem 4, so for N2 I will only show the result from training on the full 50K data set 

In [0]:
# Some parameters>
numtrain=50000
model_name="N2_full"
# Run epochs
for i in range(num_epochs):
    run_epoch(N2,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N2,val,batch_size)
# Test on test set.
net_test(N2,test,batch_size,ttype='test')
# Save model
torch.save(N2.state_dict(), datadir+model_name)

100%|██████████| 50000/50000 [00:00<00:00, 54686.06it/s, loss=0.363]
 12%|█▏        | 6000/50000 [00:00<00:00, 56478.17it/s, loss=0.211]


Training set epoch 0: Avg. loss: 0.0005, Accuracy: 46586/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9633/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 56044.18it/s, loss=0.337]
 13%|█▎        | 6500/50000 [00:00<00:00, 60876.46it/s, loss=0.213]


Training set epoch 1: Avg. loss: 0.0005, Accuracy: 46636/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9636/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 60182.66it/s, loss=0.321]
 12%|█▏        | 6000/50000 [00:00<00:00, 58293.49it/s, loss=0.174]


Training set epoch 2: Avg. loss: 0.0005, Accuracy: 46679/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9641/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 56797.63it/s, loss=0.28]
 12%|█▏        | 6000/50000 [00:00<00:00, 56049.73it/s, loss=0.209]


Training set epoch 3: Avg. loss: 0.0004, Accuracy: 46796/50000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9641/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 56638.96it/s, loss=0.322]
 12%|█▏        | 6000/50000 [00:00<00:00, 59079.14it/s, loss=0.153]


Training set epoch 4: Avg. loss: 0.0004, Accuracy: 46765/50000 (94%)


Validation set: Avg. loss: 0.0003, Accuracy: 9646/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58488.39it/s, loss=0.289]
 12%|█▏        | 6000/50000 [00:00<00:00, 58670.35it/s, loss=0.199]


Training set epoch 5: Avg. loss: 0.0004, Accuracy: 46742/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9653/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 57217.30it/s, loss=0.313]
 12%|█▏        | 6000/50000 [00:00<00:00, 58670.76it/s, loss=0.189]


Training set epoch 6: Avg. loss: 0.0004, Accuracy: 46750/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9654/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 59329.69it/s, loss=0.33]
 12%|█▏        | 6000/50000 [00:00<00:00, 58596.85it/s, loss=0.178]


Training set epoch 7: Avg. loss: 0.0004, Accuracy: 46755/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9655/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 57810.73it/s, loss=0.308]
 12%|█▏        | 6000/50000 [00:00<00:00, 57022.04it/s, loss=0.185]


Training set epoch 8: Avg. loss: 0.0004, Accuracy: 46776/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9657/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 58426.01it/s, loss=0.334]
 12%|█▏        | 6000/50000 [00:00<00:00, 56887.47it/s, loss=0.186]


Training set epoch 9: Avg. loss: 0.0004, Accuracy: 46880/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9657/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 56561.01it/s, loss=0.29]
 12%|█▏        | 6000/50000 [00:00<00:00, 56520.91it/s, loss=0.254]


Training set epoch 10: Avg. loss: 0.0004, Accuracy: 46774/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9660/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 57027.48it/s, loss=0.295]
 13%|█▎        | 6500/50000 [00:00<00:00, 62415.52it/s, loss=0.21] 


Training set epoch 11: Avg. loss: 0.0004, Accuracy: 46799/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9661/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 58079.40it/s, loss=0.275]
 12%|█▏        | 6000/50000 [00:00<00:00, 56227.68it/s, loss=0.169]


Training set epoch 12: Avg. loss: 0.0004, Accuracy: 46919/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9664/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 58638.81it/s, loss=0.291]
 12%|█▏        | 6000/50000 [00:00<00:00, 59841.40it/s, loss=0.179]


Training set epoch 13: Avg. loss: 0.0004, Accuracy: 46918/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9670/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 56226.54it/s, loss=0.28]
 13%|█▎        | 6500/50000 [00:00<00:00, 60427.55it/s, loss=0.163]


Training set epoch 14: Avg. loss: 0.0004, Accuracy: 46854/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9667/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 57280.27it/s, loss=0.264]
 12%|█▏        | 6000/50000 [00:00<00:00, 59410.95it/s, loss=0.154]


Training set epoch 15: Avg. loss: 0.0004, Accuracy: 46901/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9666/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 56283.94it/s, loss=0.312]
 12%|█▏        | 6000/50000 [00:00<00:00, 57437.60it/s, loss=0.154]


Training set epoch 16: Avg. loss: 0.0004, Accuracy: 46964/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9671/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 58515.28it/s, loss=0.286]
 11%|█         | 5500/50000 [00:00<00:00, 52958.99it/s, loss=0.216]


Training set epoch 17: Avg. loss: 0.0004, Accuracy: 46906/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9666/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 56011.13it/s, loss=0.297]
 12%|█▏        | 6000/50000 [00:00<00:00, 57148.04it/s, loss=0.175]


Training set epoch 18: Avg. loss: 0.0004, Accuracy: 46913/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9671/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 57118.06it/s, loss=0.289]



Training set epoch 19: Avg. loss: 0.0004, Accuracy: 46949/50000 (94%)


Validation set: Avg. loss: 0.0002, Accuracy: 9672/10000 (97%)


Test set: Avg. loss: 0.0002, Accuracy: 9672/10000 (97%)



As expected, with more data, the test accuracy is slightly better at 97% compared to 96% with just 1000 digits. It seems that there isn't much benefit to increase training data for this model.

Next I compare the results to N2rand

In [0]:
N2rand = N2_Net(p=dropout_p,minimizer = minimizer)
N2rand.to(device)
PP = []
for name, param in N2rand.named_parameters():
  print(name,param.shape)
  if 'fc10' in name:
    PP.append(param)
N2rand.optimizer = torch.optim.Adam(PP,lr=step_size)

conv1.weight torch.Size([32, 1, 5, 5])
conv1.bias torch.Size([32])
conv2.weight torch.Size([64, 32, 5, 5])
conv2.bias torch.Size([64])
fc10.weight torch.Size([10, 1024])
fc10.bias torch.Size([10])


In [0]:
# Some parameters>
numtrain=1000
model_name="N2rand_small"

# Run epochs
for i in range(num_epochs):
    run_epoch(N2rand,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N2rand,val,batch_size)
# Test on test set.
net_test(N2rand,test,batch_size,ttype='test')
# Save model
torch.save(N2rand.state_dict(), datadir+model_name)

100%|██████████| 1000/1000 [00:00<00:00, 39897.50it/s, loss=2.31]
100%|██████████| 1000/1000 [00:00<00:00, 42477.41it/s, loss=2.28]
100%|██████████| 1000/1000 [00:00<00:00, 48457.13it/s, loss=2.25]



Training set epoch 0: Avg. loss: 0.0046, Accuracy: 98/1000 (10%)


Validation set: Avg. loss: 0.0046, Accuracy: 2323/10000 (23%)


Training set epoch 1: Avg. loss: 0.0046, Accuracy: 158/1000 (16%)


Validation set: Avg. loss: 0.0045, Accuracy: 4750/10000 (48%)


Training set epoch 2: Avg. loss: 0.0045, Accuracy: 249/1000 (25%)



100%|██████████| 1000/1000 [00:00<00:00, 54665.29it/s, loss=2.22]
100%|██████████| 1000/1000 [00:00<00:00, 51542.91it/s, loss=2.18]
100%|██████████| 1000/1000 [00:00<00:00, 56609.40it/s, loss=2.16]


Validation set: Avg. loss: 0.0044, Accuracy: 4461/10000 (45%)


Training set epoch 3: Avg. loss: 0.0044, Accuracy: 318/1000 (32%)


Validation set: Avg. loss: 0.0044, Accuracy: 4340/10000 (43%)


Training set epoch 4: Avg. loss: 0.0044, Accuracy: 393/1000 (39%)


Validation set: Avg. loss: 0.0043, Accuracy: 5081/10000 (51%)


Training set epoch 5: Avg. loss: 0.0043, Accuracy: 435/1000 (44%)




100%|██████████| 1000/1000 [00:00<00:00, 56951.46it/s, loss=2.12]
100%|██████████| 1000/1000 [00:00<00:00, 61958.84it/s, loss=2.08]
100%|██████████| 1000/1000 [00:00<00:00, 50084.23it/s, loss=2.06]


Validation set: Avg. loss: 0.0043, Accuracy: 5964/10000 (60%)


Training set epoch 6: Avg. loss: 0.0042, Accuracy: 542/1000 (54%)


Validation set: Avg. loss: 0.0042, Accuracy: 6654/10000 (67%)


Training set epoch 7: Avg. loss: 0.0042, Accuracy: 606/1000 (61%)


Validation set: Avg. loss: 0.0041, Accuracy: 7148/10000 (71%)


Training set epoch 8: Avg. loss: 0.0041, Accuracy: 662/1000 (66%)




100%|██████████| 1000/1000 [00:00<00:00, 55507.38it/s, loss=2.04]
100%|██████████| 1000/1000 [00:00<00:00, 61265.60it/s, loss=2.01]
100%|██████████| 1000/1000 [00:00<00:00, 46632.39it/s, loss=1.97]


Validation set: Avg. loss: 0.0041, Accuracy: 7488/10000 (75%)


Training set epoch 9: Avg. loss: 0.0041, Accuracy: 685/1000 (68%)


Validation set: Avg. loss: 0.0040, Accuracy: 7674/10000 (77%)


Training set epoch 10: Avg. loss: 0.0040, Accuracy: 727/1000 (73%)


Validation set: Avg. loss: 0.0040, Accuracy: 7782/10000 (78%)


Training set epoch 11: Avg. loss: 0.0039, Accuracy: 755/1000 (76%)




100%|██████████| 1000/1000 [00:00<00:00, 50379.61it/s, loss=1.95]
100%|██████████| 1000/1000 [00:00<00:00, 45324.72it/s, loss=1.92]
100%|██████████| 1000/1000 [00:00<00:00, 53908.59it/s, loss=1.9]


Validation set: Avg. loss: 0.0039, Accuracy: 7860/10000 (79%)


Training set epoch 12: Avg. loss: 0.0039, Accuracy: 755/1000 (76%)


Validation set: Avg. loss: 0.0039, Accuracy: 7928/10000 (79%)


Training set epoch 13: Avg. loss: 0.0038, Accuracy: 768/1000 (77%)


Validation set: Avg. loss: 0.0038, Accuracy: 7974/10000 (80%)


Training set epoch 14: Avg. loss: 0.0038, Accuracy: 773/1000 (77%)




100%|██████████| 1000/1000 [00:00<00:00, 55525.75it/s, loss=1.86]
100%|██████████| 1000/1000 [00:00<00:00, 49587.44it/s, loss=1.85]
100%|██████████| 1000/1000 [00:00<00:00, 52652.57it/s, loss=1.81]


Validation set: Avg. loss: 0.0038, Accuracy: 7996/10000 (80%)


Training set epoch 15: Avg. loss: 0.0037, Accuracy: 791/1000 (79%)


Validation set: Avg. loss: 0.0037, Accuracy: 8007/10000 (80%)


Training set epoch 16: Avg. loss: 0.0037, Accuracy: 806/1000 (81%)


Validation set: Avg. loss: 0.0037, Accuracy: 8003/10000 (80%)


Training set epoch 17: Avg. loss: 0.0036, Accuracy: 790/1000 (79%)




100%|██████████| 1000/1000 [00:00<00:00, 52508.22it/s, loss=1.8]
100%|██████████| 1000/1000 [00:00<00:00, 51161.90it/s, loss=1.77]



Validation set: Avg. loss: 0.0036, Accuracy: 7999/10000 (80%)


Training set epoch 18: Avg. loss: 0.0036, Accuracy: 792/1000 (79%)


Validation set: Avg. loss: 0.0036, Accuracy: 7999/10000 (80%)


Training set epoch 19: Avg. loss: 0.0035, Accuracy: 798/1000 (80%)


Validation set: Avg. loss: 0.0035, Accuracy: 8014/10000 (80%)


Test set: Avg. loss: 0.0035, Accuracy: 8207/10000 (82%)



In [0]:
# Some parameters>
numtrain= 50000
model_name="N2rand_full"

# Run epochs
for i in range(num_epochs):
    run_epoch(N2rand,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N2rand,val,batch_size)
# Test on test set.
net_test(N2rand,test,batch_size,ttype='test')
# Save model
torch.save(N2rand.state_dict(), datadir+model_name)

100%|██████████| 50000/50000 [00:00<00:00, 53126.99it/s, loss=1.14]
 13%|█▎        | 6500/50000 [00:00<00:00, 59676.90it/s, loss=0.953]


Training set epoch 0: Avg. loss: 0.0027, Accuracy: 40511/50000 (81%)


Validation set: Avg. loss: 0.0020, Accuracy: 8715/10000 (87%)



100%|██████████| 50000/50000 [00:00<00:00, 58412.72it/s, loss=0.872]
 13%|█▎        | 6500/50000 [00:00<00:00, 58978.22it/s, loss=0.653]


Training set epoch 1: Avg. loss: 0.0018, Accuracy: 42633/50000 (85%)


Validation set: Avg. loss: 0.0014, Accuracy: 8942/10000 (89%)



100%|██████████| 50000/50000 [00:00<00:00, 58739.62it/s, loss=0.731]
 13%|█▎        | 6500/50000 [00:00<00:00, 61748.70it/s, loss=0.563]


Training set epoch 2: Avg. loss: 0.0014, Accuracy: 43621/50000 (87%)


Validation set: Avg. loss: 0.0011, Accuracy: 9066/10000 (91%)



100%|██████████| 50000/50000 [00:00<00:00, 59536.71it/s, loss=0.657]
 12%|█▏        | 6000/50000 [00:00<00:00, 56413.22it/s, loss=0.447]


Training set epoch 3: Avg. loss: 0.0012, Accuracy: 44314/50000 (89%)


Validation set: Avg. loss: 0.0009, Accuracy: 9144/10000 (91%)



100%|██████████| 50000/50000 [00:00<00:00, 58312.84it/s, loss=0.576]
 13%|█▎        | 6500/50000 [00:00<00:00, 59107.45it/s, loss=0.438]


Training set epoch 4: Avg. loss: 0.0010, Accuracy: 44711/50000 (89%)


Validation set: Avg. loss: 0.0008, Accuracy: 9197/10000 (92%)



100%|██████████| 50000/50000 [00:00<00:00, 57537.72it/s, loss=0.532]
 12%|█▏        | 6000/50000 [00:00<00:00, 56559.15it/s, loss=0.35] 


Training set epoch 5: Avg. loss: 0.0009, Accuracy: 44962/50000 (90%)


Validation set: Avg. loss: 0.0008, Accuracy: 9259/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 59534.12it/s, loss=0.509]
 12%|█▏        | 6000/50000 [00:00<00:00, 58909.45it/s, loss=0.333]


Training set epoch 6: Avg. loss: 0.0008, Accuracy: 45250/50000 (90%)


Validation set: Avg. loss: 0.0007, Accuracy: 9288/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 59174.35it/s, loss=0.495]
 13%|█▎        | 6500/50000 [00:00<00:00, 59117.72it/s, loss=0.346]


Training set epoch 7: Avg. loss: 0.0008, Accuracy: 45473/50000 (91%)


Validation set: Avg. loss: 0.0006, Accuracy: 9322/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 56914.44it/s, loss=0.435]
 12%|█▏        | 6000/50000 [00:00<00:00, 55758.88it/s, loss=0.297]


Training set epoch 8: Avg. loss: 0.0007, Accuracy: 45636/50000 (91%)


Validation set: Avg. loss: 0.0006, Accuracy: 9349/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 55622.64it/s, loss=0.452]
 12%|█▏        | 6000/50000 [00:00<00:00, 56821.58it/s, loss=0.283]


Training set epoch 9: Avg. loss: 0.0007, Accuracy: 45814/50000 (92%)


Validation set: Avg. loss: 0.0006, Accuracy: 9368/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 55216.37it/s, loss=0.406]
 13%|█▎        | 6500/50000 [00:00<00:00, 62322.07it/s, loss=0.288]


Training set epoch 10: Avg. loss: 0.0007, Accuracy: 45932/50000 (92%)


Validation set: Avg. loss: 0.0005, Accuracy: 9387/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 59143.29it/s, loss=0.397]
 12%|█▏        | 6000/50000 [00:00<00:00, 57360.23it/s, loss=0.251]


Training set epoch 11: Avg. loss: 0.0007, Accuracy: 45997/50000 (92%)


Validation set: Avg. loss: 0.0005, Accuracy: 9411/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 56879.89it/s, loss=0.39]
 12%|█▏        | 6000/50000 [00:00<00:00, 57189.60it/s, loss=0.256]


Training set epoch 12: Avg. loss: 0.0006, Accuracy: 46147/50000 (92%)


Validation set: Avg. loss: 0.0005, Accuracy: 9427/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 59479.04it/s, loss=0.371]
 12%|█▏        | 6000/50000 [00:00<00:00, 57803.97it/s, loss=0.248]


Training set epoch 13: Avg. loss: 0.0006, Accuracy: 46231/50000 (92%)


Validation set: Avg. loss: 0.0005, Accuracy: 9450/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 57904.53it/s, loss=0.382]
 12%|█▏        | 6000/50000 [00:00<00:00, 59205.62it/s, loss=0.227]


Training set epoch 14: Avg. loss: 0.0006, Accuracy: 46336/50000 (93%)


Validation set: Avg. loss: 0.0005, Accuracy: 9461/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 56838.22it/s, loss=0.37]
 13%|█▎        | 6500/50000 [00:00<00:00, 58432.63it/s, loss=0.244]


Training set epoch 15: Avg. loss: 0.0006, Accuracy: 46354/50000 (93%)


Validation set: Avg. loss: 0.0005, Accuracy: 9474/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 58343.29it/s, loss=0.344]
 12%|█▏        | 6000/50000 [00:00<00:00, 58078.78it/s, loss=0.218]


Training set epoch 16: Avg. loss: 0.0006, Accuracy: 46441/50000 (93%)


Validation set: Avg. loss: 0.0004, Accuracy: 9479/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 57785.80it/s, loss=0.34]
 12%|█▏        | 6000/50000 [00:00<00:00, 57697.94it/s, loss=0.258]


Training set epoch 17: Avg. loss: 0.0005, Accuracy: 46528/50000 (93%)


Validation set: Avg. loss: 0.0004, Accuracy: 9487/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 56201.49it/s, loss=0.336]
 12%|█▏        | 6000/50000 [00:00<00:00, 59262.79it/s, loss=0.245]


Training set epoch 18: Avg. loss: 0.0005, Accuracy: 46646/50000 (93%)


Validation set: Avg. loss: 0.0004, Accuracy: 9499/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 58258.66it/s, loss=0.331]



Training set epoch 19: Avg. loss: 0.0005, Accuracy: 46589/50000 (93%)


Validation set: Avg. loss: 0.0004, Accuracy: 9497/10000 (95%)


Test set: Avg. loss: 0.0004, Accuracy: 9512/10000 (95%)



While increasing the number of training data only had marginal effects on N2, it has great effect on N2rand. With only 1000 data for training, N2rand perfroms very badly with only 80% accuracy. With the full training set, it has 95% accuracy. This is because N2 has it's parameters transferred from N1, so all of it's parameters have already been trained to some extent to recognize certain features of any image. N2rand on the other hand can only train the parameters in fc10 layer and all the parameters in convolution layer are random and aren't getting trained.  , so having more data to train those few parameters can substantially improve the performance. 

# Problem 6: Test the neural nets on transformed data

I test N2 trained on full data set, N2rand trained on full data set, and N0 trained on full data set. Since I have all the models already except for N0 trained on the full data set, I train and save it below:

---



In [0]:
# Some parameters>
numtrain= 50000
model_name="N0_full"

# Run epochs
for i in range(num_epochs):
    run_epoch(N0,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N0,val,batch_size)
# Test on test set.
net_test(N0,test,batch_size,ttype='test')
# Save model
torch.save(N0.state_dict(), datadir+model_name)

100%|██████████| 50000/50000 [00:00<00:00, 51191.96it/s, loss=0.22]
 12%|█▏        | 6000/50000 [00:00<00:00, 55945.56it/s, loss=0.142]


Training set epoch 0: Avg. loss: 0.0004, Accuracy: 46735/50000 (93%)


Validation set: Avg. loss: 0.0002, Accuracy: 9747/10000 (97%)



100%|██████████| 50000/50000 [00:00<00:00, 53933.63it/s, loss=0.133]
 11%|█         | 5500/50000 [00:00<00:00, 52243.81it/s, loss=0.139]


Training set epoch 1: Avg. loss: 0.0002, Accuracy: 48239/50000 (96%)


Validation set: Avg. loss: 0.0001, Accuracy: 9837/10000 (98%)



100%|██████████| 50000/50000 [00:00<00:00, 53511.85it/s, loss=0.139]
 11%|█         | 5500/50000 [00:00<00:00, 52502.30it/s, loss=0.0911]


Training set epoch 2: Avg. loss: 0.0002, Accuracy: 48603/50000 (97%)


Validation set: Avg. loss: 0.0001, Accuracy: 9872/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52718.06it/s, loss=0.116]
 11%|█         | 5500/50000 [00:00<00:00, 51919.98it/s, loss=0.0856]


Training set epoch 3: Avg. loss: 0.0002, Accuracy: 48799/50000 (98%)


Validation set: Avg. loss: 0.0001, Accuracy: 9887/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 53235.92it/s, loss=0.0857]
 11%|█         | 5500/50000 [00:00<00:00, 51237.41it/s, loss=0.0593]


Training set epoch 4: Avg. loss: 0.0001, Accuracy: 48982/50000 (98%)


Validation set: Avg. loss: 0.0001, Accuracy: 9900/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52259.09it/s, loss=0.0652]
 10%|█         | 5000/50000 [00:00<00:00, 49647.43it/s, loss=0.06]  


Training set epoch 5: Avg. loss: 0.0001, Accuracy: 49078/50000 (98%)


Validation set: Avg. loss: 0.0001, Accuracy: 9909/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52978.59it/s, loss=0.0943]
 12%|█▏        | 6000/50000 [00:00<00:00, 55942.33it/s, loss=0.0548]


Training set epoch 6: Avg. loss: 0.0001, Accuracy: 49192/50000 (98%)


Validation set: Avg. loss: 0.0001, Accuracy: 9907/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 54138.58it/s, loss=0.0813]
 12%|█▏        | 6000/50000 [00:00<00:00, 56566.14it/s, loss=0.059] 


Training set epoch 7: Avg. loss: 0.0001, Accuracy: 49250/50000 (98%)


Validation set: Avg. loss: 0.0001, Accuracy: 9919/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 54205.26it/s, loss=0.0455]
 11%|█         | 5500/50000 [00:00<00:00, 54103.42it/s, loss=0.0527]


Training set epoch 8: Avg. loss: 0.0001, Accuracy: 49251/50000 (99%)


Validation set: Avg. loss: 0.0001, Accuracy: 9912/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 53296.66it/s, loss=0.0572]
 11%|█         | 5500/50000 [00:00<00:00, 53599.71it/s, loss=0.0575]


Training set epoch 9: Avg. loss: 0.0001, Accuracy: 49326/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9920/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 53485.19it/s, loss=0.0384]
 11%|█         | 5500/50000 [00:00<00:00, 53655.94it/s, loss=0.0453]


Training set epoch 10: Avg. loss: 0.0001, Accuracy: 49387/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9927/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52570.25it/s, loss=0.0508]
 12%|█▏        | 6000/50000 [00:00<00:00, 55878.73it/s, loss=0.0588]


Training set epoch 11: Avg. loss: 0.0001, Accuracy: 49408/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9933/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 53212.78it/s, loss=0.0472]
 11%|█         | 5500/50000 [00:00<00:00, 54280.89it/s, loss=0.0604]


Training set epoch 12: Avg. loss: 0.0001, Accuracy: 49441/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9927/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52983.90it/s, loss=0.0444]
 11%|█         | 5500/50000 [00:00<00:00, 53718.03it/s, loss=0.0264]


Training set epoch 13: Avg. loss: 0.0001, Accuracy: 49490/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9934/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52978.56it/s, loss=0.0326]
 11%|█         | 5500/50000 [00:00<00:00, 53612.04it/s, loss=0.0303]


Training set epoch 14: Avg. loss: 0.0001, Accuracy: 49486/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9928/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52814.19it/s, loss=0.0325]
 11%|█         | 5500/50000 [00:00<00:00, 53210.51it/s, loss=0.0242]


Training set epoch 15: Avg. loss: 0.0001, Accuracy: 49492/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9939/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 52633.31it/s, loss=0.0494]
 11%|█         | 5500/50000 [00:00<00:00, 53017.78it/s, loss=0.0245]


Training set epoch 16: Avg. loss: 0.0001, Accuracy: 49541/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9932/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 53523.63it/s, loss=0.0507]
 12%|█▏        | 6000/50000 [00:00<00:00, 55537.27it/s, loss=0.0241]


Training set epoch 17: Avg. loss: 0.0001, Accuracy: 49555/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9928/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 54360.27it/s, loss=0.0373]
 12%|█▏        | 6000/50000 [00:00<00:00, 54857.74it/s, loss=0.0418]


Training set epoch 18: Avg. loss: 0.0001, Accuracy: 49593/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9934/10000 (99%)



100%|██████████| 50000/50000 [00:00<00:00, 53159.45it/s, loss=0.0423]



Training set epoch 19: Avg. loss: 0.0001, Accuracy: 49568/50000 (99%)


Validation set: Avg. loss: 0.0000, Accuracy: 9936/10000 (99%)


Test set: Avg. loss: 0.0001, Accuracy: 9918/10000 (99%)



In [0]:
train_tr, val_tr,test_tr = get_data(data_set='trans')

(70000, 1, 28, 28)


In [0]:
N0_full = N0_Net(p = dropout_p, minimizer=minimizer)
N0_full.to(device)
model_name = "N0_full"
#load saved model
state_dict = torch.load(datadir+model_name, map_location = device)
N0_full.load_state_dict(state_dict)
net_test(N0_full,test_tr,batch_size,ttype='test')


Test set: Avg. loss: 0.0062, Accuracy: 5379/10000 (54%)



In [0]:
N2_full = N2_Net(p = dropout_p, minimizer=minimizer)
N2_full.to(device)
model_name = "N2_full"
#load saved model
state_dict = torch.load(datadir+model_name, map_location = device)
N2_full.load_state_dict(state_dict)
net_test(N2_full,test_tr,batch_size,ttype='test')


Test set: Avg. loss: 0.0078, Accuracy: 3735/10000 (37%)



In [0]:
N2rand_full = N2_Net(p = dropout_p, minimizer=minimizer)
N2rand_full.to(device)
model_name = "N2rand_full"
#load saved model
state_dict = torch.load(datadir+model_name, map_location = device)
N2rand_full.load_state_dict(state_dict)
net_test(N2rand_full,test_tr,batch_size,ttype='test')


Test set: Avg. loss: 0.0058, Accuracy: 3262/10000 (33%)



N2 and N2rand don't perform any better than N0. In fact, they perform worse, although some fine-tuning of model parameters such as temperature used in softmax may improve it a bit. 

While N2 has advantage over N0 in the sense that it requires smaller number of training data to yield a decent performance, when testing on transformed data, N2 doesn't perform well because it only optimizes for parameters in fc10 based on the labels. The other parameters in the convolutional layers are trained based on simplified SimCLR framework, which only included shifting and scaling of handwritte letters, a set of data that is not only different from the digits data, but also more limited in variety because it doesn't include rotations that the transformed MNIST data include. 

We can see that when we compare N0 and N2 models that are trained on small training set, N2 performs better than N0 on the transformed MNIST data. 

In [0]:
N0_small = N0_Net(p = dropout_p, minimizer=minimizer)
N0_small.to(device)
model_name = "N0_small"
#load saved model
state_dict = torch.load(datadir+model_name, map_location = device)
N0_small.load_state_dict(state_dict)
net_test(N0_small,test_tr,batch_size,ttype='test')


Test set: Avg. loss: 0.0077, Accuracy: 2961/10000 (30%)



In [0]:
N2_small = N2_Net(p = dropout_p, minimizer=minimizer)
N2_small.to(device)
model_name = "N2_small"
#load saved model
state_dict = torch.load(datadir+model_name, map_location = device)
N2_small.load_state_dict(state_dict)
net_test(N2_small,test_tr,batch_size,ttype='test')


Test set: Avg. loss: 0.0067, Accuracy: 3582/10000 (36%)



# Problem 7

## (a): Use the alternative loss function proposed in problem 2

The alternative I proposed in problem 2 is to use negative euclidean distance as similarity measure instead of cosine similarity. 

In [0]:
class NT_Xent_euc(nn.Module):
  def __init__(self):
    super(NT_Xent_euc,self).__init__()
  
  def forward(self,o,o_tilde):
    # The inputs o and o_tilde are the representations of images and augmented images
    # passed through encoder 
    
    # normalize feature vectors
    o = F.normalize(o,dim=1)
    o_tilde = F.normalize(o_tilde,dim=1)

    # Compute similarity matrix
    uv = torch.cat([o,o_tilde],dim=0)
    uv_norm = (uv**2).sum(1).view(-1,1)
    uv_norm_prime = uv_norm.view(1,-1)
    sim_mat = -1 * (uv_norm + uv_norm_prime - 2.0*torch.mm(uv,torch.transpose(uv,0,1)))

    # Compute softmax 
    sim_sums = torch.sum(sim_mat,dim=1)
    self_sim = torch.diag(sim_mat)
    denoms = sim_sums - self_sim
    num = -1*(torch.pow(o-o_tilde,2).sum(1))
    nums = torch.cat([num,num],dim=0)
    softmaxes = torch.div(nums,denoms)

    # Compute negative loss 
    neglog_losses = -torch.log(softmaxes)
    loss = torch.mean(neglog_losses)
    return loss

In [0]:
class N1_Net_Euc(nn.Module):
    def __init__(self,p=0.5,minimizer='Adam'):
        super(N1_Net_Euc, self).__init__()
        # 32 output features using 5x5 kernel applied to input image
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        # 64 output features using 5x5 kernel applied to 32 features of previous layer.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        # Dropout - zero out some output features so weights aren't updated.
        self.conv2_drop = nn.Dropout2d(p)
        # 64 x 4 x 4 = 1024 units total in final spartial layer fully connected to 64 unit later
        self.fc64 = nn.Linear(1024, 64)
        if minimizer == 'Adam':
            self.optimizer = torch.optim.Adam(self.parameters(), lr = step_size)
        else:
            self.optimizer = torch.optim.SGD(self.parameters(), lr = step_size, momentum=0.9)
        self.first=True
        # negative log-likelihood loss 
        self.criterion=nn.CrossEntropyLoss()
            
    def forward(self, x):

        # Apply first conv then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        # Apply second conv then drop, then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Reshape 64 x 4 x 4 to 1024 units
        x = x.view(-1, 1024)
        # Apply fully connected layer with non-linearity relu, final 64 units 
        x = self.fc64(x)
        return x
    
    def get_acc_and_loss(self, data):
        # Apply network to batch input
        o = self.forward(data)
        affined = affine(data)
        o_tilde = self.forward(affined)
        # Comput loss between logit output and targ (correct class labels)
        loss = NT_Xent_euc()(o,o_tilde)
        # Also compute correct classification rate        
        return loss
        
    def run_grad(self,data):

        # Compute loss and accuracy
        loss =self.get_acc_and_loss(data)
        # Zero out gradients
        self.optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters based on gradients
        self.optimizer.step()
        
        return loss

In [0]:
numtrain=124800
data_set="letters"
model_name="N1_Euc"
train = get_data(data_set=data_set)
# Initialize the model
N1_Euc = N1_Net_Euc(p = dropout_p, minimizer=minimizer)
N1_Euc.to(device)
# Run epochs
train_err = []
for i in range(num_epochs):
    train_err.append(run_epoch_simCLR(N1_Euc,i,train,batch_size, num=numtrain, ttype="train"))
# Save model
torch.save(N1_Euc.state_dict(), datadir+model_name)

100%|██████████| 124800/124800 [00:04<00:00, 28033.36it/s, loss=5.75]
  0%|          | 500/124800 [00:00<00:08, 14188.35it/s, loss=6.28]


Training set epoch 0: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28420.68it/s, loss=5.77]
  0%|          | 500/124800 [00:00<00:08, 15045.10it/s, loss=6.26]


Training set epoch 1: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28191.89it/s, loss=5.74]
  0%|          | 500/124800 [00:00<00:08, 14770.13it/s, loss=6.25]


Training set epoch 2: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28148.64it/s, loss=5.75]
  0%|          | 500/124800 [00:00<00:08, 14631.02it/s, loss=6.26]


Training set epoch 3: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28275.14it/s, loss=5.77]
  0%|          | 500/124800 [00:00<00:08, 14762.33it/s, loss=6.25]


Training set epoch 4: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 27934.91it/s, loss=5.74]
  0%|          | 500/124800 [00:00<00:08, 14976.45it/s, loss=6.27]


Training set epoch 5: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28147.30it/s, loss=5.77]
  0%|          | 500/124800 [00:00<00:08, 14924.97it/s, loss=6.27]


Training set epoch 6: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28181.59it/s, loss=5.74]
  0%|          | 500/124800 [00:00<00:08, 14489.70it/s, loss=6.24]


Training set epoch 7: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28298.21it/s, loss=5.73]
  0%|          | 500/124800 [00:00<00:07, 15850.29it/s, loss=6.24]


Training set epoch 8: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28549.75it/s, loss=5.72]
  0%|          | 500/124800 [00:00<00:08, 15301.98it/s, loss=6.25]


Training set epoch 9: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28527.45it/s, loss=5.73]
  0%|          | 0/124800 [00:00<?, ?it/s, loss=6.24]


Training set epoch 10: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28495.48it/s, loss=5.74]
  0%|          | 500/124800 [00:00<00:08, 15103.51it/s, loss=6.25]


Training set epoch 11: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28655.99it/s, loss=5.72]
  0%|          | 500/124800 [00:00<00:08, 15531.70it/s, loss=6.23]


Training set epoch 12: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28718.04it/s, loss=5.72]
  0%|          | 500/124800 [00:00<00:07, 15854.84it/s, loss=6.23]


Training set epoch 13: Avg. loss: 0.0063


100%|██████████| 124800/124800 [00:04<00:00, 28787.45it/s, loss=5.73]
  0%|          | 500/124800 [00:00<00:07, 15637.55it/s, loss=6.25]


Training set epoch 14: Avg. loss: 0.0062


100%|██████████| 124800/124800 [00:04<00:00, 28676.35it/s, loss=5.71]
  0%|          | 500/124800 [00:00<00:08, 15346.21it/s, loss=6.25]


Training set epoch 15: Avg. loss: 0.0062


100%|██████████| 124800/124800 [00:04<00:00, 28518.61it/s, loss=5.76]
  0%|          | 500/124800 [00:00<00:07, 15799.18it/s, loss=6.23]


Training set epoch 16: Avg. loss: 0.0062


100%|██████████| 124800/124800 [00:04<00:00, 28697.09it/s, loss=5.71]
  0%|          | 500/124800 [00:00<00:07, 15540.21it/s, loss=6.23]


Training set epoch 17: Avg. loss: 0.0062


100%|██████████| 124800/124800 [00:04<00:00, 28573.79it/s, loss=5.71]
  0%|          | 500/124800 [00:00<00:08, 14975.59it/s, loss=6.23]


Training set epoch 18: Avg. loss: 0.0062


100%|██████████| 124800/124800 [00:04<00:00, 28491.73it/s, loss=5.73]


Training set epoch 19: Avg. loss: 0.0062





In [0]:
model_name = "N1_Euc"
N1_Euc = N1_Net_Euc(p = dropout_p, minimizer=minimizer)
N1_Euc.to(device)
state_dict = torch.load(datadir+model_name, map_location = device)
N1_Euc.load_state_dict(state_dict)
N2_Euc = N2_Net(p=dropout_p,minimizer = minimizer)
N2_Euc.to(device)
# Get parameter sets of both networks
params = N1_Euc.named_parameters()
params2 = N2_Euc.named_parameters()
# Make a dictionary of the new one
dict_params2 = dict(params2)
# Loop over parameters of N1
for name, param in params:
  if name in dict_params2:
    dict_params2[name].data.copy_(param.data)
N2_Euc.load_state_dict(dict_params2)
PP = []
for name, param in N2_Euc.named_parameters():
  print(name,param.shape)
  if 'fc10' in name:
    PP.append(param)
N2_Euc.optimizer = torch.optim.Adam(PP,lr=step_size)

conv1.weight torch.Size([32, 1, 5, 5])
conv1.bias torch.Size([32])
conv2.weight torch.Size([64, 32, 5, 5])
conv2.bias torch.Size([64])
fc10.weight torch.Size([10, 1024])
fc10.bias torch.Size([10])


In [35]:
numtrain=50000
data_set="mnist"
model_name="N2_Euc"
# get data
train,val,test=get_data(data_set=data_set)
# Run epochs
for i in range(num_epochs):
    run_epoch(N2_Euc,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N2_Euc,val,batch_size)
# Test on test set.
net_test(N2_Euc,test_tr,batch_size,ttype='test')
# Save model
torch.save(N2_Euc.state_dict(), datadir+model_name)

  6%|▌         | 3000/50000 [00:00<00:01, 36641.18it/s, loss=2.24]

(70000, 784)


100%|██████████| 50000/50000 [00:00<00:00, 53900.46it/s, loss=1.53]
 13%|█▎        | 6500/50000 [00:00<00:00, 60108.73it/s, loss=1.4] 


Training set epoch 0: Avg. loss: 0.0037, Accuracy: 34171/50000 (68%)


Validation set: Avg. loss: 0.0029, Accuracy: 8476/10000 (85%)



100%|██████████| 50000/50000 [00:00<00:00, 57644.65it/s, loss=1.16]
 12%|█▏        | 6000/50000 [00:00<00:00, 56684.51it/s, loss=1.07]


Training set epoch 1: Avg. loss: 0.0025, Accuracy: 41356/50000 (83%)


Validation set: Avg. loss: 0.0021, Accuracy: 8880/10000 (89%)



100%|██████████| 50000/50000 [00:00<00:00, 55855.34it/s, loss=0.97]
 12%|█▏        | 6000/50000 [00:00<00:00, 56316.14it/s, loss=0.882]


Training set epoch 2: Avg. loss: 0.0020, Accuracy: 42708/50000 (85%)


Validation set: Avg. loss: 0.0016, Accuracy: 9027/10000 (90%)



100%|██████████| 50000/50000 [00:00<00:00, 57783.05it/s, loss=0.838]
 13%|█▎        | 6500/50000 [00:00<00:00, 60908.83it/s, loss=0.735]


Training set epoch 3: Avg. loss: 0.0016, Accuracy: 43268/50000 (87%)


Validation set: Avg. loss: 0.0014, Accuracy: 9124/10000 (91%)



100%|██████████| 50000/50000 [00:00<00:00, 58807.04it/s, loss=0.751]
 12%|█▏        | 6000/50000 [00:00<00:00, 57121.96it/s, loss=0.695]


Training set epoch 4: Avg. loss: 0.0014, Accuracy: 43671/50000 (87%)


Validation set: Avg. loss: 0.0012, Accuracy: 9204/10000 (92%)



100%|██████████| 50000/50000 [00:00<00:00, 57465.18it/s, loss=0.698]
 13%|█▎        | 6500/50000 [00:00<00:00, 57142.98it/s, loss=0.583]


Training set epoch 5: Avg. loss: 0.0013, Accuracy: 44096/50000 (88%)


Validation set: Avg. loss: 0.0010, Accuracy: 9240/10000 (92%)



100%|██████████| 50000/50000 [00:00<00:00, 59009.13it/s, loss=0.644]
 12%|█▏        | 6000/50000 [00:00<00:00, 57632.67it/s, loss=0.55] 


Training set epoch 6: Avg. loss: 0.0012, Accuracy: 44309/50000 (89%)


Validation set: Avg. loss: 0.0009, Accuracy: 9281/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 58025.54it/s, loss=0.566]
 12%|█▏        | 6000/50000 [00:00<00:00, 54401.21it/s, loss=0.506]


Training set epoch 7: Avg. loss: 0.0011, Accuracy: 44611/50000 (89%)


Validation set: Avg. loss: 0.0008, Accuracy: 9318/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 56358.27it/s, loss=0.591]
 12%|█▏        | 6000/50000 [00:00<00:00, 55397.04it/s, loss=0.48] 


Training set epoch 8: Avg. loss: 0.0010, Accuracy: 44752/50000 (90%)


Validation set: Avg. loss: 0.0008, Accuracy: 9364/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 56015.44it/s, loss=0.509]
 13%|█▎        | 6500/50000 [00:00<00:00, 60349.83it/s, loss=0.478]


Training set epoch 9: Avg. loss: 0.0009, Accuracy: 44907/50000 (90%)


Validation set: Avg. loss: 0.0007, Accuracy: 9381/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 58881.61it/s, loss=0.498]
 13%|█▎        | 6500/50000 [00:00<00:00, 60287.51it/s, loss=0.442]


Training set epoch 10: Avg. loss: 0.0009, Accuracy: 45015/50000 (90%)


Validation set: Avg. loss: 0.0007, Accuracy: 9402/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 58813.09it/s, loss=0.462]
 13%|█▎        | 6500/50000 [00:00<00:00, 60827.43it/s, loss=0.393]


Training set epoch 11: Avg. loss: 0.0009, Accuracy: 45109/50000 (90%)


Validation set: Avg. loss: 0.0006, Accuracy: 9423/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 57915.72it/s, loss=0.467]
 13%|█▎        | 6500/50000 [00:00<00:00, 58898.70it/s, loss=0.391]


Training set epoch 12: Avg. loss: 0.0008, Accuracy: 45163/50000 (90%)


Validation set: Avg. loss: 0.0006, Accuracy: 9440/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 58442.21it/s, loss=0.465]
 13%|█▎        | 6500/50000 [00:00<00:00, 59084.69it/s, loss=0.381]


Training set epoch 13: Avg. loss: 0.0008, Accuracy: 45328/50000 (91%)


Validation set: Avg. loss: 0.0006, Accuracy: 9462/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 57355.51it/s, loss=0.451]
 13%|█▎        | 6500/50000 [00:00<00:00, 59491.28it/s, loss=0.352]


Training set epoch 14: Avg. loss: 0.0008, Accuracy: 45443/50000 (91%)


Validation set: Avg. loss: 0.0006, Accuracy: 9474/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 58079.74it/s, loss=0.432]
 12%|█▏        | 6000/50000 [00:00<00:00, 58415.54it/s, loss=0.373]


Training set epoch 15: Avg. loss: 0.0007, Accuracy: 45540/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9483/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 55771.82it/s, loss=0.4]
 13%|█▎        | 6500/50000 [00:00<00:00, 61778.78it/s, loss=0.342]


Training set epoch 16: Avg. loss: 0.0007, Accuracy: 45567/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9492/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 56638.22it/s, loss=0.413]
 12%|█▏        | 6000/50000 [00:00<00:00, 53760.60it/s, loss=0.347]


Training set epoch 17: Avg. loss: 0.0007, Accuracy: 45578/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9504/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 57835.56it/s, loss=0.405]
 13%|█▎        | 6500/50000 [00:00<00:00, 60375.89it/s, loss=0.325]


Training set epoch 18: Avg. loss: 0.0007, Accuracy: 45647/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9514/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 56530.86it/s, loss=0.382]



Training set epoch 19: Avg. loss: 0.0007, Accuracy: 45724/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9522/10000 (95%)


Test set: Avg. loss: 0.0051, Accuracy: 3871/10000 (39%)



Performance of negative euclidean distance is actually better than performance from default NT-Xent using cosine similarity. However this may simple due to lack of tuning on models using either loss. Nevertheless, there is not a substantial difference in using a different similarity measure, and it seems that the root cause of underperformance of models using simCLR framework is the lack of fine-tuning of embeddings on the actual digits data. 

# (b): Try different type of perturbation

I experiment on applying a full affine map to the image grid and compare the results using both NT-Xent loss as well as loss used in part (a).

In [0]:
def affine_full(x_in, factor=3):
  nn = x_in.shape[0]
  h = x_in.shape[2]
  w = x_in.shape[3]
  # for each batch element sample 6 random parameters 
  # for an affine transformation of the grid: 
  # factor controls the deviation from identity.
  u = ((torch.rand(nn,6)-0.5) * factor).to(device)
  # Add this random vector to this identity affine map:
  ID = torch.zeros(nn,6).to(device)
  ID[:,0] = 1
  ID[:,4] = 1
  theta = (u+ID).reshape(-1,2,3)
  # Creates the mapping of the deformed grid
  grid = F.affine_grid(theta, [nn,1,h,w] ,align_corners=True)
  # Applies this mapping to the image
  x_out = F.grid_sample(x_in, grid, padding_mode = 'border',align_corners=True)
  return x_out

In [0]:
class N1_Net_Full(nn.Module):
    def __init__(self,p=0.5,minimizer='Adam'):
        super(N1_Net_Full, self).__init__()
        # 32 output features using 5x5 kernel applied to input image
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        # 64 output features using 5x5 kernel applied to 32 features of previous layer.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        # Dropout - zero out some output features so weights aren't updated.
        self.conv2_drop = nn.Dropout2d(p)
        # 64 x 4 x 4 = 1024 units total in final spartial layer fully connected to 64 unit later
        self.fc64 = nn.Linear(1024, 64)
        if minimizer == 'Adam':
            self.optimizer = torch.optim.Adam(self.parameters(), lr = step_size)
        else:
            self.optimizer = torch.optim.SGD(self.parameters(), lr = step_size, momentum=0.9)
        self.first=True
        # negative log-likelihood loss 
        self.criterion=nn.CrossEntropyLoss()
            
    def forward(self, x):

        # Apply first conv then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        # Apply second conv then drop, then maxpool by factor of 2 then non-linearity relu
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Reshape 64 x 4 x 4 to 1024 units
        x = x.view(-1, 1024)
        # Apply fully connected layer with non-linearity relu, final 64 units 
        x = self.fc64(x)
        return x
    
    def get_acc_and_loss(self, data):
        # Apply network to batch input
        o = self.forward(data)
        affined = affine_full(data)
        o_tilde = self.forward(affined)
        # Comput loss between logit output and targ (correct class labels)
        loss = NT_Xent()(o,o_tilde)
        # Also compute correct classification rate        
        return loss
        
    def run_grad(self,data):

        # Compute loss and accuracy
        loss =self.get_acc_and_loss(data)
        # Zero out gradients
        self.optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters based on gradients
        self.optimizer.step()
        
        return loss

In [38]:
numtrain=124800
data_set="letters"
model_name="N1_fullaffine"
train = get_data(data_set=data_set)
# Initialize the model
N1_fullaffine = N1_Net_Full(p = dropout_p, minimizer=minimizer)
N1_fullaffine.to(device)
#define optimizer
# Run epochs
train_err = []
for i in range(num_epochs):
    train_err.append(run_epoch_simCLR(N1_fullaffine,i,train,batch_size, num=numtrain, ttype="train"))
# Save model
torch.save(net.state_dict(), datadir+model_name)

100%|██████████| 124800/124800 [00:04<00:00, 28208.81it/s, loss=-2.74]
  0%|          | 500/124800 [00:00<00:08, 14478.59it/s, loss=-2.21]


Training set epoch 0: Avg. loss: -0.0021


100%|██████████| 124800/124800 [00:04<00:00, 28582.86it/s, loss=-2.75]
  0%|          | 500/124800 [00:00<00:08, 15288.26it/s, loss=-2.24]


Training set epoch 1: Avg. loss: -0.0022


100%|██████████| 124800/124800 [00:04<00:00, 28545.85it/s, loss=-2.78]
  0%|          | 500/124800 [00:00<00:08, 14772.84it/s, loss=-2.26]


Training set epoch 2: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28253.94it/s, loss=-2.77]
  0%|          | 500/124800 [00:00<00:07, 15625.43it/s, loss=-2.26]


Training set epoch 3: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28441.69it/s, loss=-2.77]
  0%|          | 500/124800 [00:00<00:08, 14988.44it/s, loss=-2.26]


Training set epoch 4: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28533.71it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 14994.22it/s, loss=-2.26]


Training set epoch 5: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28580.45it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:07, 15807.52it/s, loss=-2.26]


Training set epoch 6: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28840.23it/s, loss=-2.78]
  0%|          | 500/124800 [00:00<00:07, 15900.28it/s, loss=-2.27]


Training set epoch 7: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28676.40it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:07, 15801.09it/s, loss=-2.27]


Training set epoch 8: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28984.25it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 15294.17it/s, loss=-2.27]


Training set epoch 9: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 29033.57it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 15475.88it/s, loss=-2.25]


Training set epoch 10: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28933.99it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 15206.56it/s, loss=-2.28]


Training set epoch 11: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28890.55it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:07, 15714.07it/s, loss=-2.27]


Training set epoch 12: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28985.98it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:07, 15641.75it/s, loss=-2.28]


Training set epoch 13: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 29218.95it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 15270.56it/s, loss=-2.28]


Training set epoch 14: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28893.44it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 15373.10it/s, loss=-2.28]


Training set epoch 15: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28705.17it/s, loss=-2.79]
  0%|          | 500/124800 [00:00<00:08, 15063.26it/s, loss=-2.29]


Training set epoch 16: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28842.03it/s, loss=-2.8]
  0%|          | 500/124800 [00:00<00:08, 14821.70it/s, loss=-2.28]


Training set epoch 17: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 28651.35it/s, loss=-2.81]
  0%|          | 500/124800 [00:00<00:08, 15156.77it/s, loss=-2.29]


Training set epoch 18: Avg. loss: -0.0023


100%|██████████| 124800/124800 [00:04<00:00, 29012.33it/s, loss=-2.79]



Training set epoch 19: Avg. loss: -0.0023


In [39]:
model_name = "N1_fullaffine"
N1_fullaffine = N1_Net_Full(p = dropout_p, minimizer=minimizer)
N1_fullaffine.to(device)
state_dict = torch.load(datadir+model_name, map_location = device)
N1_fullaffine.load_state_dict(state_dict)

N2_fullaffine = N2_Net(p=dropout_p,minimizer = minimizer)
N2_fullaffine.to(device)
# Get parameter sets of both networks
params = N1_fullaffine.named_parameters()
params2 = N2_fullaffine.named_parameters()
# Make a dictionary of the new one
dict_params2 = dict(params2)
# Loop over parameters of N1
for name, param in params:
  if name in dict_params2:
    dict_params2[name].data.copy_(param.data) 
N2_fullaffine.load_state_dict(dict_params2)

PP = []
for name, param in N2_fullaffine.named_parameters():
  print(name,param.shape)
  if 'fc10' in name:
    PP.append(param)
N2_fullaffine.optimizer = torch.optim.Adam(PP,lr=step_size)

conv1.weight torch.Size([32, 1, 5, 5])
conv1.bias torch.Size([32])
conv2.weight torch.Size([64, 32, 5, 5])
conv2.bias torch.Size([64])
fc10.weight torch.Size([10, 1024])
fc10.bias torch.Size([10])


In [40]:
numtrain=50000
data_set="mnist"
model_name="N2_fullaffine"
# get data
train,val,test=get_data(data_set=data_set)

# Run epochs
for i in range(num_epochs):
    run_epoch(N2_fullaffine,i,train,batch_size, num=numtrain, ttype="train")
    # Test on validation set.
    net_test(N2_fullaffine,val,batch_size)
# Test on test set.
net_test(N2_fullaffine,test,batch_size,ttype='test')
# Save model
torch.save(N2_fullaffine.state_dict(), datadir+model_name)

(70000, 784)


100%|██████████| 50000/50000 [00:00<00:00, 54844.37it/s, loss=1.02]
 13%|█▎        | 6500/50000 [00:00<00:00, 59833.42it/s, loss=0.798]


Training set epoch 0: Avg. loss: 0.0028, Accuracy: 37700/50000 (75%)


Validation set: Avg. loss: 0.0016, Accuracy: 9002/10000 (90%)



100%|██████████| 50000/50000 [00:00<00:00, 58621.50it/s, loss=0.728]
 12%|█▏        | 6000/50000 [00:00<00:00, 57311.24it/s, loss=0.535]


Training set epoch 1: Avg. loss: 0.0014, Accuracy: 43544/50000 (87%)


Validation set: Avg. loss: 0.0010, Accuracy: 9197/10000 (92%)



100%|██████████| 50000/50000 [00:00<00:00, 57821.94it/s, loss=0.606]
 12%|█▏        | 6000/50000 [00:00<00:00, 55122.21it/s, loss=0.498]


Training set epoch 2: Avg. loss: 0.0011, Accuracy: 44455/50000 (89%)


Validation set: Avg. loss: 0.0008, Accuracy: 9296/10000 (93%)



100%|██████████| 50000/50000 [00:00<00:00, 57720.67it/s, loss=0.52]
 12%|█▏        | 6000/50000 [00:00<00:00, 59834.01it/s, loss=0.378]


Training set epoch 3: Avg. loss: 0.0009, Accuracy: 45107/50000 (90%)


Validation set: Avg. loss: 0.0006, Accuracy: 9372/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 57733.63it/s, loss=0.512]
 12%|█▏        | 6000/50000 [00:00<00:00, 54767.76it/s, loss=0.352]


Training set epoch 4: Avg. loss: 0.0008, Accuracy: 45257/50000 (91%)


Validation set: Avg. loss: 0.0006, Accuracy: 9420/10000 (94%)



100%|██████████| 50000/50000 [00:00<00:00, 57861.85it/s, loss=0.467]
 13%|█▎        | 6500/50000 [00:00<00:00, 61111.99it/s, loss=0.323]


Training set epoch 5: Avg. loss: 0.0007, Accuracy: 45551/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9458/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 61174.93it/s, loss=0.43]
 12%|█▏        | 6000/50000 [00:00<00:00, 59766.79it/s, loss=0.296]


Training set epoch 6: Avg. loss: 0.0007, Accuracy: 45585/50000 (91%)


Validation set: Avg. loss: 0.0005, Accuracy: 9478/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 58945.29it/s, loss=0.423]
 12%|█▏        | 6000/50000 [00:00<00:00, 56760.83it/s, loss=0.273]


Training set epoch 7: Avg. loss: 0.0006, Accuracy: 45886/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9499/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 58756.82it/s, loss=0.415]
 11%|█         | 5500/50000 [00:00<00:00, 54676.50it/s, loss=0.295]


Training set epoch 8: Avg. loss: 0.0006, Accuracy: 46045/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9518/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 57140.80it/s, loss=0.386]
 12%|█▏        | 6000/50000 [00:00<00:00, 59321.74it/s, loss=0.257]


Training set epoch 9: Avg. loss: 0.0006, Accuracy: 46144/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9539/10000 (95%)



100%|██████████| 50000/50000 [00:00<00:00, 59730.94it/s, loss=0.365]
 12%|█▏        | 6000/50000 [00:00<00:00, 57091.25it/s, loss=0.245]


Training set epoch 10: Avg. loss: 0.0006, Accuracy: 46100/50000 (92%)


Validation set: Avg. loss: 0.0004, Accuracy: 9554/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58631.29it/s, loss=0.342]
 12%|█▏        | 6000/50000 [00:00<00:00, 58024.14it/s, loss=0.218]


Training set epoch 11: Avg. loss: 0.0005, Accuracy: 46324/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9572/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58901.55it/s, loss=0.369]
 13%|█▎        | 6500/50000 [00:00<00:00, 61034.14it/s, loss=0.245]


Training set epoch 12: Avg. loss: 0.0005, Accuracy: 46311/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9578/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 60372.63it/s, loss=0.34]
 12%|█▏        | 6000/50000 [00:00<00:00, 57011.71it/s, loss=0.225]


Training set epoch 13: Avg. loss: 0.0005, Accuracy: 46387/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9588/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 56119.05it/s, loss=0.356]
 13%|█▎        | 6500/50000 [00:00<00:00, 59740.33it/s, loss=0.203]


Training set epoch 14: Avg. loss: 0.0005, Accuracy: 46422/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9594/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 56786.62it/s, loss=0.362]
 12%|█▏        | 6000/50000 [00:00<00:00, 56130.24it/s, loss=0.257]


Training set epoch 15: Avg. loss: 0.0005, Accuracy: 46479/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9607/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58996.40it/s, loss=0.35]
 12%|█▏        | 6000/50000 [00:00<00:00, 56943.73it/s, loss=0.206]


Training set epoch 16: Avg. loss: 0.0005, Accuracy: 46530/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9608/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58736.22it/s, loss=0.328]
 13%|█▎        | 6500/50000 [00:00<00:00, 59608.79it/s, loss=0.202]


Training set epoch 17: Avg. loss: 0.0005, Accuracy: 46498/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9619/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 57439.35it/s, loss=0.335]
 13%|█▎        | 6500/50000 [00:00<00:00, 60193.40it/s, loss=0.189]


Training set epoch 18: Avg. loss: 0.0005, Accuracy: 46560/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9622/10000 (96%)



100%|██████████| 50000/50000 [00:00<00:00, 58673.20it/s, loss=0.347]



Training set epoch 19: Avg. loss: 0.0005, Accuracy: 46583/50000 (93%)


Validation set: Avg. loss: 0.0003, Accuracy: 9630/10000 (96%)


Test set: Avg. loss: 0.0003, Accuracy: 9629/10000 (96%)



In [41]:
N2_fullaffine = N2_Net(p = dropout_p, minimizer=minimizer)
N2_fullaffine.to(device)
model_name = "N2_fullaffine"
#load saved model
state_dict = torch.load(datadir+model_name, map_location = device)
N2_fullaffine.load_state_dict(state_dict)
net_test(N2_fullaffine,test_tr,batch_size,ttype='test')


Test set: Avg. loss: 0.0067, Accuracy: 3573/10000 (36%)



Applying a full affine map to image grid doesn't really improve the performance by much. It seems that the problem is not the limited range of affine transformation that we have been training N2 with, but rather that we are not fine-tuning the convolutional layer parameters transferred from N1 on digits images. Right now N2 is only trained by digits data through fc10 layer and image representations are not fine-tuned to the digits dataset. 